Diffstat (limited to 'media/libvpx/vp8/common/arm/neon')
-rw-r--r--  media/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c                |  699
-rw-r--r--  media/libvpx/vp8/common/arm/neon/copymem_neon.c                         |   59
-rw-r--r--  media/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c                |   42
-rw-r--r--  media/libvpx/vp8/common/arm/neon/dequant_idct_neon.c                    |  142
-rw-r--r--  media/libvpx/vp8/common/arm/neon/dequantizeb_neon.c                     |   25
-rw-r--r--  media/libvpx/vp8/common/arm/neon/idct_blk_neon.c                        |   96
-rw-r--r--  media/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c               |   62
-rw-r--r--  media/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c            |  185
-rw-r--r--  media/libvpx/vp8/common/arm/neon/iwalsh_neon.c                          |  102
-rw-r--r--  media/libvpx/vp8/common/arm/neon/loopfilter_neon.c                      |  550
-rw-r--r--  media/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c  |  111
-rw-r--r--  media/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c    |  280
-rw-r--r--  media/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c                    |  625
-rw-r--r--  media/libvpx/vp8/common/arm/neon/reconintra_neon.c                      |  210
-rw-r--r--  media/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c                 |  123
-rw-r--r--  media/libvpx/vp8/common/arm/neon/sixtappredict_neon.c                   | 1754
-rw-r--r--  media/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c            | 1017
17 files changed, 6082 insertions, 0 deletions
diff --git a/media/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c b/media/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c
new file mode 100644
index 000000000..9824a3193
--- /dev/null
+++ b/media/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c
@@ -0,0 +1,699 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+static const uint8_t bifilter4_coeff[8][2] = {
+ {128, 0},
+ {112, 16},
+ { 96, 32},
+ { 80, 48},
+ { 64, 64},
+ { 48, 80},
+ { 32, 96},
+ { 16, 112}
+};
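+/* Each row holds the two taps of the VP8 bilinear sub-pel filter for one
+ * eighth-pel offset.  The taps always sum to 128, so every filtered value
+ * below is normalized with a rounding narrow shift by 7 (vqrshrn_n_u16).
+ */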
+
+void vp8_bilinear_predict4x4_neon(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8;
+ uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8;
+ uint8x16_t q1u8, q2u8;
+ uint16x8_t q1u16, q2u16;
+ uint16x8_t q7u16, q8u16, q9u16;
+ uint64x2_t q4u64, q5u64;
+ uint64x1_t d12u64;
+ uint32x2x2_t d0u32x2, d1u32x2, d2u32x2, d3u32x2;
+
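+ /* First pass: horizontal bilinear filter over 5 source rows (4 output rows
+ * plus one extra row needed by the vertical pass).  Second pass: vertical
+ * filter producing the 4x4 block.  Either pass is skipped when its offset
+ * is zero. */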
+ if (xoffset == 0) { // skip_1stpass_filter
+ uint32x2_t d28u32 = vdup_n_u32(0);
+ uint32x2_t d29u32 = vdup_n_u32(0);
+ uint32x2_t d30u32 = vdup_n_u32(0);
+
+ d28u32 = vld1_lane_u32((const uint32_t *)src_ptr, d28u32, 0);
+ src_ptr += src_pixels_per_line;
+ d28u32 = vld1_lane_u32((const uint32_t *)src_ptr, d28u32, 1);
+ src_ptr += src_pixels_per_line;
+ d29u32 = vld1_lane_u32((const uint32_t *)src_ptr, d29u32, 0);
+ src_ptr += src_pixels_per_line;
+ d29u32 = vld1_lane_u32((const uint32_t *)src_ptr, d29u32, 1);
+ src_ptr += src_pixels_per_line;
+ d30u32 = vld1_lane_u32((const uint32_t *)src_ptr, d30u32, 0);
+ d28u8 = vreinterpret_u8_u32(d28u32);
+ d29u8 = vreinterpret_u8_u32(d29u32);
+ d30u8 = vreinterpret_u8_u32(d30u32);
+ } else {
+ d2u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d3u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d4u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d5u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d6u8 = vld1_u8(src_ptr);
+
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ q2u8 = vcombine_u8(d4u8, d5u8);
+
+ d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
+ d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
+
+ q4u64 = vshrq_n_u64(vreinterpretq_u64_u8(q1u8), 8);
+ q5u64 = vshrq_n_u64(vreinterpretq_u64_u8(q2u8), 8);
+ d12u64 = vshr_n_u64(vreinterpret_u64_u8(d6u8), 8);
+
+ d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q1u8)),
+ vreinterpret_u32_u8(vget_high_u8(q1u8)));
+ d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q2u8)),
+ vreinterpret_u32_u8(vget_high_u8(q2u8)));
+ d2u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q4u64)),
+ vreinterpret_u32_u64(vget_high_u64(q4u64)));
+ d3u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),
+ vreinterpret_u32_u64(vget_high_u64(q5u64)));
+
+ q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d0u8);
+ q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d0u8);
+ q9u16 = vmull_u8(d6u8, d0u8);
+
+ q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d2u32x2.val[0]), d1u8);
+ q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d3u32x2.val[0]), d1u8);
+ q9u16 = vmlal_u8(q9u16, vreinterpret_u8_u64(d12u64), d1u8);
+
+ d28u8 = vqrshrn_n_u16(q7u16, 7);
+ d29u8 = vqrshrn_n_u16(q8u16, 7);
+ d30u8 = vqrshrn_n_u16(q9u16, 7);
+ }
+
+ // secondpass_filter
+ if (yoffset == 0) { // skip_2ndpass_filter
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 0);
+ dst_ptr += dst_pitch;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 1);
+ dst_ptr += dst_pitch;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d29u8), 0);
+ dst_ptr += dst_pitch;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d29u8), 1);
+ } else {
+ d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
+ d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
+
+ q1u16 = vmull_u8(d28u8, d0u8);
+ q2u16 = vmull_u8(d29u8, d0u8);
+
+ d26u8 = vext_u8(d28u8, d29u8, 4);
+ d27u8 = vext_u8(d29u8, d30u8, 4);
+
+ q1u16 = vmlal_u8(q1u16, d26u8, d1u8);
+ q2u16 = vmlal_u8(q2u16, d27u8, d1u8);
+
+ d2u8 = vqrshrn_n_u16(q1u16, 7);
+ d3u8 = vqrshrn_n_u16(q2u16, 7);
+
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 0);
+ dst_ptr += dst_pitch;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 1);
+ dst_ptr += dst_pitch;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0);
+ dst_ptr += dst_pitch;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1);
+ }
+ return;
+}
+
+void vp8_bilinear_predict8x4_neon(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8;
+ uint8x8_t d7u8, d9u8, d11u8, d22u8, d23u8, d24u8, d25u8, d26u8;
+ uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8;
+ uint16x8_t q1u16, q2u16, q3u16, q4u16;
+ uint16x8_t q6u16, q7u16, q8u16, q9u16, q10u16;
+
+ if (xoffset == 0) { // skip_1stpass_filter
+ d22u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d23u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d24u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d25u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d26u8 = vld1_u8(src_ptr);
+ } else {
+ q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q5u8 = vld1q_u8(src_ptr);
+
+ d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
+ d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
+
+ q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
+ q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
+ q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+ q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+ q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
+
+ d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1);
+ d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1);
+ d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+ d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+ d11u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+
+ q6u16 = vmlal_u8(q6u16, d3u8, d1u8);
+ q7u16 = vmlal_u8(q7u16, d5u8, d1u8);
+ q8u16 = vmlal_u8(q8u16, d7u8, d1u8);
+ q9u16 = vmlal_u8(q9u16, d9u8, d1u8);
+ q10u16 = vmlal_u8(q10u16, d11u8, d1u8);
+
+ d22u8 = vqrshrn_n_u16(q6u16, 7);
+ d23u8 = vqrshrn_n_u16(q7u16, 7);
+ d24u8 = vqrshrn_n_u16(q8u16, 7);
+ d25u8 = vqrshrn_n_u16(q9u16, 7);
+ d26u8 = vqrshrn_n_u16(q10u16, 7);
+ }
+
+ // secondpass_filter
+ if (yoffset == 0) { // skip_2ndpass_filter
+ vst1_u8((uint8_t *)dst_ptr, d22u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d23u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d24u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d25u8);
+ } else {
+ d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
+ d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
+
+ q1u16 = vmull_u8(d22u8, d0u8);
+ q2u16 = vmull_u8(d23u8, d0u8);
+ q3u16 = vmull_u8(d24u8, d0u8);
+ q4u16 = vmull_u8(d25u8, d0u8);
+
+ q1u16 = vmlal_u8(q1u16, d23u8, d1u8);
+ q2u16 = vmlal_u8(q2u16, d24u8, d1u8);
+ q3u16 = vmlal_u8(q3u16, d25u8, d1u8);
+ q4u16 = vmlal_u8(q4u16, d26u8, d1u8);
+
+ d2u8 = vqrshrn_n_u16(q1u16, 7);
+ d3u8 = vqrshrn_n_u16(q2u16, 7);
+ d4u8 = vqrshrn_n_u16(q3u16, 7);
+ d5u8 = vqrshrn_n_u16(q4u16, 7);
+
+ vst1_u8((uint8_t *)dst_ptr, d2u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d3u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d4u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d5u8);
+ }
+ return;
+}
+
+void vp8_bilinear_predict8x8_neon(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8, d11u8;
+ uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8, d30u8;
+ uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8;
+ uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16;
+ uint16x8_t q6u16, q7u16, q8u16, q9u16, q10u16;
+
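+ /* The 8x8 case needs 9 horizontally filtered rows so the vertical pass has
+ * one row below the last output row; when xoffset != 0 they are filtered in
+ * a batch of four followed by a batch of five to stay within registers. */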
+ if (xoffset == 0) { // skip_1stpass_filter
+ d22u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d23u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d24u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d25u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d26u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d27u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d28u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d29u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d30u8 = vld1_u8(src_ptr);
+ } else {
+ q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+
+ d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
+ d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
+
+ q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
+ q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
+ q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+ q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+
+ d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1);
+ d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1);
+ d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+ d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+
+ q6u16 = vmlal_u8(q6u16, d3u8, d1u8);
+ q7u16 = vmlal_u8(q7u16, d5u8, d1u8);
+ q8u16 = vmlal_u8(q8u16, d7u8, d1u8);
+ q9u16 = vmlal_u8(q9u16, d9u8, d1u8);
+
+ d22u8 = vqrshrn_n_u16(q6u16, 7);
+ d23u8 = vqrshrn_n_u16(q7u16, 7);
+ d24u8 = vqrshrn_n_u16(q8u16, 7);
+ d25u8 = vqrshrn_n_u16(q9u16, 7);
+
+ // first-pass filtering on the remaining 5 lines of data
+ q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q5u8 = vld1q_u8(src_ptr);
+
+ q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
+ q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
+ q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+ q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+ q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
+
+ d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1);
+ d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1);
+ d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+ d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+ d11u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+
+ q6u16 = vmlal_u8(q6u16, d3u8, d1u8);
+ q7u16 = vmlal_u8(q7u16, d5u8, d1u8);
+ q8u16 = vmlal_u8(q8u16, d7u8, d1u8);
+ q9u16 = vmlal_u8(q9u16, d9u8, d1u8);
+ q10u16 = vmlal_u8(q10u16, d11u8, d1u8);
+
+ d26u8 = vqrshrn_n_u16(q6u16, 7);
+ d27u8 = vqrshrn_n_u16(q7u16, 7);
+ d28u8 = vqrshrn_n_u16(q8u16, 7);
+ d29u8 = vqrshrn_n_u16(q9u16, 7);
+ d30u8 = vqrshrn_n_u16(q10u16, 7);
+ }
+
+ // secondpass_filter
+ if (yoffset == 0) { // skip_2ndpass_filter
+ vst1_u8((uint8_t *)dst_ptr, d22u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d23u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d24u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d25u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d26u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d27u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d28u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d29u8);
+ } else {
+ d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
+ d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
+
+ q1u16 = vmull_u8(d22u8, d0u8);
+ q2u16 = vmull_u8(d23u8, d0u8);
+ q3u16 = vmull_u8(d24u8, d0u8);
+ q4u16 = vmull_u8(d25u8, d0u8);
+ q5u16 = vmull_u8(d26u8, d0u8);
+ q6u16 = vmull_u8(d27u8, d0u8);
+ q7u16 = vmull_u8(d28u8, d0u8);
+ q8u16 = vmull_u8(d29u8, d0u8);
+
+ q1u16 = vmlal_u8(q1u16, d23u8, d1u8);
+ q2u16 = vmlal_u8(q2u16, d24u8, d1u8);
+ q3u16 = vmlal_u8(q3u16, d25u8, d1u8);
+ q4u16 = vmlal_u8(q4u16, d26u8, d1u8);
+ q5u16 = vmlal_u8(q5u16, d27u8, d1u8);
+ q6u16 = vmlal_u8(q6u16, d28u8, d1u8);
+ q7u16 = vmlal_u8(q7u16, d29u8, d1u8);
+ q8u16 = vmlal_u8(q8u16, d30u8, d1u8);
+
+ d2u8 = vqrshrn_n_u16(q1u16, 7);
+ d3u8 = vqrshrn_n_u16(q2u16, 7);
+ d4u8 = vqrshrn_n_u16(q3u16, 7);
+ d5u8 = vqrshrn_n_u16(q4u16, 7);
+ d6u8 = vqrshrn_n_u16(q5u16, 7);
+ d7u8 = vqrshrn_n_u16(q6u16, 7);
+ d8u8 = vqrshrn_n_u16(q7u16, 7);
+ d9u8 = vqrshrn_n_u16(q8u16, 7);
+
+ vst1_u8((uint8_t *)dst_ptr, d2u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d3u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d4u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d5u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d6u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d7u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d8u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d9u8);
+ }
+ return;
+}
+
+void vp8_bilinear_predict16x16_neon(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ int i;
+ unsigned char tmp[272];
+ unsigned char *tmpp;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
+ uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8;
+ uint8x8_t d19u8, d20u8, d21u8;
+ uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8;
+ uint8x16_t q11u8, q12u8, q13u8, q14u8, q15u8;
+ uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16;
+ uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16;
+
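+ /* When only one offset is non-zero, a single filter pass writes straight to
+ * dst.  Otherwise the horizontal pass fills the 16-wide by 17-tall tmp[]
+ * buffer and the vertical pass reads it back to produce the 16x16 block. */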
+ if (xoffset == 0) { // secondpass_bfilter16x16_only
+ d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
+ d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
+
+ q11u8 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ for (i = 4; i > 0; i--) {
+ q12u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q13u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q14u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q15u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+
+ q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
+ q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
+ q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
+ q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
+ q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
+ q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
+ q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
+ q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);
+
+ q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
+ q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
+ q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
+ q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
+ q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
+ q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
+ q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
+ q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);
+
+ d2u8 = vqrshrn_n_u16(q1u16, 7);
+ d3u8 = vqrshrn_n_u16(q2u16, 7);
+ d4u8 = vqrshrn_n_u16(q3u16, 7);
+ d5u8 = vqrshrn_n_u16(q4u16, 7);
+ d6u8 = vqrshrn_n_u16(q5u16, 7);
+ d7u8 = vqrshrn_n_u16(q6u16, 7);
+ d8u8 = vqrshrn_n_u16(q7u16, 7);
+ d9u8 = vqrshrn_n_u16(q8u16, 7);
+
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ q2u8 = vcombine_u8(d4u8, d5u8);
+ q3u8 = vcombine_u8(d6u8, d7u8);
+ q4u8 = vcombine_u8(d8u8, d9u8);
+
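+ // carry the last loaded row over as the top row of the next iteration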
+ q11u8 = q15u8;
+
+ vst1q_u8((uint8_t *)dst_ptr, q1u8); dst_ptr += dst_pitch;
+ vst1q_u8((uint8_t *)dst_ptr, q2u8); dst_ptr += dst_pitch;
+ vst1q_u8((uint8_t *)dst_ptr, q3u8); dst_ptr += dst_pitch;
+ vst1q_u8((uint8_t *)dst_ptr, q4u8); dst_ptr += dst_pitch;
+ }
+ return;
+ }
+
+ if (yoffset == 0) { // firstpass_bfilter16x16_only
+ d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
+ d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
+
+ for (i = 4; i > 0; i--) {
+ d2u8 = vld1_u8(src_ptr);
+ d3u8 = vld1_u8(src_ptr + 8);
+ d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+ d5u8 = vld1_u8(src_ptr);
+ d6u8 = vld1_u8(src_ptr + 8);
+ d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+ d8u8 = vld1_u8(src_ptr);
+ d9u8 = vld1_u8(src_ptr + 8);
+ d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+ d11u8 = vld1_u8(src_ptr);
+ d12u8 = vld1_u8(src_ptr + 8);
+ d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+
+ q7u16 = vmull_u8(d2u8, d0u8);
+ q8u16 = vmull_u8(d3u8, d0u8);
+ q9u16 = vmull_u8(d5u8, d0u8);
+ q10u16 = vmull_u8(d6u8, d0u8);
+ q11u16 = vmull_u8(d8u8, d0u8);
+ q12u16 = vmull_u8(d9u8, d0u8);
+ q13u16 = vmull_u8(d11u8, d0u8);
+ q14u16 = vmull_u8(d12u8, d0u8);
+
+ d2u8 = vext_u8(d2u8, d3u8, 1);
+ d5u8 = vext_u8(d5u8, d6u8, 1);
+ d8u8 = vext_u8(d8u8, d9u8, 1);
+ d11u8 = vext_u8(d11u8, d12u8, 1);
+
+ q7u16 = vmlal_u8(q7u16, d2u8, d1u8);
+ q9u16 = vmlal_u8(q9u16, d5u8, d1u8);
+ q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
+ q13u16 = vmlal_u8(q13u16, d11u8, d1u8);
+
+ d3u8 = vext_u8(d3u8, d4u8, 1);
+ d6u8 = vext_u8(d6u8, d7u8, 1);
+ d9u8 = vext_u8(d9u8, d10u8, 1);
+ d12u8 = vext_u8(d12u8, d13u8, 1);
+
+ q8u16 = vmlal_u8(q8u16, d3u8, d1u8);
+ q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
+ q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
+ q14u16 = vmlal_u8(q14u16, d12u8, d1u8);
+
+ d14u8 = vqrshrn_n_u16(q7u16, 7);
+ d15u8 = vqrshrn_n_u16(q8u16, 7);
+ d16u8 = vqrshrn_n_u16(q9u16, 7);
+ d17u8 = vqrshrn_n_u16(q10u16, 7);
+ d18u8 = vqrshrn_n_u16(q11u16, 7);
+ d19u8 = vqrshrn_n_u16(q12u16, 7);
+ d20u8 = vqrshrn_n_u16(q13u16, 7);
+ d21u8 = vqrshrn_n_u16(q14u16, 7);
+
+ q7u8 = vcombine_u8(d14u8, d15u8);
+ q8u8 = vcombine_u8(d16u8, d17u8);
+ q9u8 = vcombine_u8(d18u8, d19u8);
+ q10u8 = vcombine_u8(d20u8, d21u8);
+
+ vst1q_u8((uint8_t *)dst_ptr, q7u8); dst_ptr += dst_pitch;
+ vst1q_u8((uint8_t *)dst_ptr, q8u8); dst_ptr += dst_pitch;
+ vst1q_u8((uint8_t *)dst_ptr, q9u8); dst_ptr += dst_pitch;
+ vst1q_u8((uint8_t *)dst_ptr, q10u8); dst_ptr += dst_pitch;
+ }
+ return;
+ }
+
+ d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
+ d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
+
+ d2u8 = vld1_u8(src_ptr);
+ d3u8 = vld1_u8(src_ptr + 8);
+ d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+ d5u8 = vld1_u8(src_ptr);
+ d6u8 = vld1_u8(src_ptr + 8);
+ d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+ d8u8 = vld1_u8(src_ptr);
+ d9u8 = vld1_u8(src_ptr + 8);
+ d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+ d11u8 = vld1_u8(src_ptr);
+ d12u8 = vld1_u8(src_ptr + 8);
+ d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+
+ // First Pass: output_height lines x output_width columns (17x16)
+ tmpp = tmp;
+ for (i = 3; i > 0; i--) {
+ q7u16 = vmull_u8(d2u8, d0u8);
+ q8u16 = vmull_u8(d3u8, d0u8);
+ q9u16 = vmull_u8(d5u8, d0u8);
+ q10u16 = vmull_u8(d6u8, d0u8);
+ q11u16 = vmull_u8(d8u8, d0u8);
+ q12u16 = vmull_u8(d9u8, d0u8);
+ q13u16 = vmull_u8(d11u8, d0u8);
+ q14u16 = vmull_u8(d12u8, d0u8);
+
+ d2u8 = vext_u8(d2u8, d3u8, 1);
+ d5u8 = vext_u8(d5u8, d6u8, 1);
+ d8u8 = vext_u8(d8u8, d9u8, 1);
+ d11u8 = vext_u8(d11u8, d12u8, 1);
+
+ q7u16 = vmlal_u8(q7u16, d2u8, d1u8);
+ q9u16 = vmlal_u8(q9u16, d5u8, d1u8);
+ q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
+ q13u16 = vmlal_u8(q13u16, d11u8, d1u8);
+
+ d3u8 = vext_u8(d3u8, d4u8, 1);
+ d6u8 = vext_u8(d6u8, d7u8, 1);
+ d9u8 = vext_u8(d9u8, d10u8, 1);
+ d12u8 = vext_u8(d12u8, d13u8, 1);
+
+ q8u16 = vmlal_u8(q8u16, d3u8, d1u8);
+ q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
+ q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
+ q14u16 = vmlal_u8(q14u16, d12u8, d1u8);
+
+ d14u8 = vqrshrn_n_u16(q7u16, 7);
+ d15u8 = vqrshrn_n_u16(q8u16, 7);
+ d16u8 = vqrshrn_n_u16(q9u16, 7);
+ d17u8 = vqrshrn_n_u16(q10u16, 7);
+ d18u8 = vqrshrn_n_u16(q11u16, 7);
+ d19u8 = vqrshrn_n_u16(q12u16, 7);
+ d20u8 = vqrshrn_n_u16(q13u16, 7);
+ d21u8 = vqrshrn_n_u16(q14u16, 7);
+
+ d2u8 = vld1_u8(src_ptr);
+ d3u8 = vld1_u8(src_ptr + 8);
+ d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+ d5u8 = vld1_u8(src_ptr);
+ d6u8 = vld1_u8(src_ptr + 8);
+ d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+ d8u8 = vld1_u8(src_ptr);
+ d9u8 = vld1_u8(src_ptr + 8);
+ d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+ d11u8 = vld1_u8(src_ptr);
+ d12u8 = vld1_u8(src_ptr + 8);
+ d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+
+ q7u8 = vcombine_u8(d14u8, d15u8);
+ q8u8 = vcombine_u8(d16u8, d17u8);
+ q9u8 = vcombine_u8(d18u8, d19u8);
+ q10u8 = vcombine_u8(d20u8, d21u8);
+
+ vst1q_u8((uint8_t *)tmpp, q7u8); tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q8u8); tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q9u8); tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q10u8); tmpp += 16;
+ }
+
+ // First-pass filtering for the remaining 5 lines
+ d14u8 = vld1_u8(src_ptr);
+ d15u8 = vld1_u8(src_ptr + 8);
+ d16u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+
+ q9u16 = vmull_u8(d2u8, d0u8);
+ q10u16 = vmull_u8(d3u8, d0u8);
+ q11u16 = vmull_u8(d5u8, d0u8);
+ q12u16 = vmull_u8(d6u8, d0u8);
+ q13u16 = vmull_u8(d8u8, d0u8);
+ q14u16 = vmull_u8(d9u8, d0u8);
+
+ d2u8 = vext_u8(d2u8, d3u8, 1);
+ d5u8 = vext_u8(d5u8, d6u8, 1);
+ d8u8 = vext_u8(d8u8, d9u8, 1);
+
+ q9u16 = vmlal_u8(q9u16, d2u8, d1u8);
+ q11u16 = vmlal_u8(q11u16, d5u8, d1u8);
+ q13u16 = vmlal_u8(q13u16, d8u8, d1u8);
+
+ d3u8 = vext_u8(d3u8, d4u8, 1);
+ d6u8 = vext_u8(d6u8, d7u8, 1);
+ d9u8 = vext_u8(d9u8, d10u8, 1);
+
+ q10u16 = vmlal_u8(q10u16, d3u8, d1u8);
+ q12u16 = vmlal_u8(q12u16, d6u8, d1u8);
+ q14u16 = vmlal_u8(q14u16, d9u8, d1u8);
+
+ q1u16 = vmull_u8(d11u8, d0u8);
+ q2u16 = vmull_u8(d12u8, d0u8);
+ q3u16 = vmull_u8(d14u8, d0u8);
+ q4u16 = vmull_u8(d15u8, d0u8);
+
+ d11u8 = vext_u8(d11u8, d12u8, 1);
+ d14u8 = vext_u8(d14u8, d15u8, 1);
+
+ q1u16 = vmlal_u8(q1u16, d11u8, d1u8);
+ q3u16 = vmlal_u8(q3u16, d14u8, d1u8);
+
+ d12u8 = vext_u8(d12u8, d13u8, 1);
+ d15u8 = vext_u8(d15u8, d16u8, 1);
+
+ q2u16 = vmlal_u8(q2u16, d12u8, d1u8);
+ q4u16 = vmlal_u8(q4u16, d15u8, d1u8);
+
+ d10u8 = vqrshrn_n_u16(q9u16, 7);
+ d11u8 = vqrshrn_n_u16(q10u16, 7);
+ d12u8 = vqrshrn_n_u16(q11u16, 7);
+ d13u8 = vqrshrn_n_u16(q12u16, 7);
+ d14u8 = vqrshrn_n_u16(q13u16, 7);
+ d15u8 = vqrshrn_n_u16(q14u16, 7);
+ d16u8 = vqrshrn_n_u16(q1u16, 7);
+ d17u8 = vqrshrn_n_u16(q2u16, 7);
+ d18u8 = vqrshrn_n_u16(q3u16, 7);
+ d19u8 = vqrshrn_n_u16(q4u16, 7);
+
+ q5u8 = vcombine_u8(d10u8, d11u8);
+ q6u8 = vcombine_u8(d12u8, d13u8);
+ q7u8 = vcombine_u8(d14u8, d15u8);
+ q8u8 = vcombine_u8(d16u8, d17u8);
+ q9u8 = vcombine_u8(d18u8, d19u8);
+
+ vst1q_u8((uint8_t *)tmpp, q5u8); tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q6u8); tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q7u8); tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q8u8); tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q9u8);
+
+ // secondpass_filter
+ d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
+ d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
+
+ tmpp = tmp;
+ q11u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+ for (i = 4; i > 0; i--) {
+ q12u8 = vld1q_u8(tmpp); tmpp += 16;
+ q13u8 = vld1q_u8(tmpp); tmpp += 16;
+ q14u8 = vld1q_u8(tmpp); tmpp += 16;
+ q15u8 = vld1q_u8(tmpp); tmpp += 16;
+
+ q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
+ q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
+ q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
+ q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
+ q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
+ q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
+ q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
+ q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);
+
+ q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
+ q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
+ q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
+ q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
+ q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
+ q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
+ q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
+ q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);
+
+ d2u8 = vqrshrn_n_u16(q1u16, 7);
+ d3u8 = vqrshrn_n_u16(q2u16, 7);
+ d4u8 = vqrshrn_n_u16(q3u16, 7);
+ d5u8 = vqrshrn_n_u16(q4u16, 7);
+ d6u8 = vqrshrn_n_u16(q5u16, 7);
+ d7u8 = vqrshrn_n_u16(q6u16, 7);
+ d8u8 = vqrshrn_n_u16(q7u16, 7);
+ d9u8 = vqrshrn_n_u16(q8u16, 7);
+
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ q2u8 = vcombine_u8(d4u8, d5u8);
+ q3u8 = vcombine_u8(d6u8, d7u8);
+ q4u8 = vcombine_u8(d8u8, d9u8);
+
+ q11u8 = q15u8;
+
+ vst1q_u8((uint8_t *)dst_ptr, q1u8); dst_ptr += dst_pitch;
+ vst1q_u8((uint8_t *)dst_ptr, q2u8); dst_ptr += dst_pitch;
+ vst1q_u8((uint8_t *)dst_ptr, q3u8); dst_ptr += dst_pitch;
+ vst1q_u8((uint8_t *)dst_ptr, q4u8); dst_ptr += dst_pitch;
+ }
+ return;
+}
diff --git a/media/libvpx/vp8/common/arm/neon/copymem_neon.c b/media/libvpx/vp8/common/arm/neon/copymem_neon.c
new file mode 100644
index 000000000..deced115c
--- /dev/null
+++ b/media/libvpx/vp8/common/arm/neon/copymem_neon.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void vp8_copy_mem8x4_neon(
+ unsigned char *src,
+ int src_stride,
+ unsigned char *dst,
+ int dst_stride) {
+ uint8x8_t vtmp;
+ int r;
+
+ for (r = 0; r < 4; r++) {
+ vtmp = vld1_u8(src);
+ vst1_u8(dst, vtmp);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vp8_copy_mem8x8_neon(
+ unsigned char *src,
+ int src_stride,
+ unsigned char *dst,
+ int dst_stride) {
+ uint8x8_t vtmp;
+ int r;
+
+ for (r = 0; r < 8; r++) {
+ vtmp = vld1_u8(src);
+ vst1_u8(dst, vtmp);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vp8_copy_mem16x16_neon(
+ unsigned char *src,
+ int src_stride,
+ unsigned char *dst,
+ int dst_stride) {
+ int r;
+ uint8x16_t qtmp;
+
+ for (r = 0; r < 16; r++) {
+ qtmp = vld1q_u8(src);
+ vst1q_u8(dst, qtmp);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
diff --git a/media/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c b/media/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c
new file mode 100644
index 000000000..ad5f41d7d
--- /dev/null
+++ b/media/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void vp8_dc_only_idct_add_neon(
+ int16_t input_dc,
+ unsigned char *pred_ptr,
+ int pred_stride,
+ unsigned char *dst_ptr,
+ int dst_stride) {
+ int i;
+ uint16_t a1 = ((input_dc + 4) >> 3);
+ uint32x2_t d2u32 = vdup_n_u32(0);
+ uint8x8_t d2u8;
+ uint16x8_t q1u16;
+ uint16x8_t qAdd;
+
+ qAdd = vdupq_n_u16(a1);
+
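+ /* Add the rounded DC term, (input_dc + 4) >> 3, to each pixel of the 4x4
+ * predictor, two rows at a time, saturating back to 8 bits. */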
+ for (i = 0; i < 2; i++) {
+ d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 0);
+ pred_ptr += pred_stride;
+ d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 1);
+ pred_ptr += pred_stride;
+
+ q1u16 = vaddw_u8(qAdd, vreinterpret_u8_u32(d2u32));
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
+
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 0);
+ dst_ptr += dst_stride;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 1);
+ dst_ptr += dst_stride;
+ }
+}
diff --git a/media/libvpx/vp8/common/arm/neon/dequant_idct_neon.c b/media/libvpx/vp8/common/arm/neon/dequant_idct_neon.c
new file mode 100644
index 000000000..58e11922c
--- /dev/null
+++ b/media/libvpx/vp8/common/arm/neon/dequant_idct_neon.c
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
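+// Fixed-point multipliers for the VP8 inverse DCT: 20091 is
+// (cos(pi/8)*sqrt(2) - 1) and 35468 is sin(pi/8)*sqrt(2), both in Q16.
+// Note that 35468 wraps to a negative value when stored as int16_t; the
+// shift-and-add sequences after each vqdmulh below compensate for both the
+// wrap-around and the doubling that vqdmulh performs.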
+static const int16_t cospi8sqrt2minus1 = 20091;
+static const int16_t sinpi8sqrt2 = 35468;
+
+void vp8_dequant_idct_add_neon(
+ int16_t *input,
+ int16_t *dq,
+ unsigned char *dst,
+ int stride) {
+ unsigned char *dst0;
+ int32x2_t d14, d15;
+ int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;
+ int16x8_t q1, q2, q3, q4, q5, q6;
+ int16x8_t qEmpty = vdupq_n_s16(0);
+ int32x2x2_t d2tmp0, d2tmp1;
+ int16x4x2_t d2tmp2, d2tmp3;
+
+ d14 = d15 = vdup_n_s32(0);
+
+ // load input
+ q3 = vld1q_s16(input);
+ vst1q_s16(input, qEmpty);
+ input += 8;
+ q4 = vld1q_s16(input);
+ vst1q_s16(input, qEmpty);
+
+ // load dq
+ q5 = vld1q_s16(dq);
+ dq += 8;
+ q6 = vld1q_s16(dq);
+
+ // load src from dst
+ dst0 = dst;
+ d14 = vld1_lane_s32((const int32_t *)dst0, d14, 0);
+ dst0 += stride;
+ d14 = vld1_lane_s32((const int32_t *)dst0, d14, 1);
+ dst0 += stride;
+ d15 = vld1_lane_s32((const int32_t *)dst0, d15, 0);
+ dst0 += stride;
+ d15 = vld1_lane_s32((const int32_t *)dst0, d15, 1);
+
+ q1 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q3),
+ vreinterpretq_u16_s16(q5)));
+ q2 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q4),
+ vreinterpretq_u16_s16(q6)));
+
+ d12 = vqadd_s16(vget_low_s16(q1), vget_low_s16(q2));
+ d13 = vqsub_s16(vget_low_s16(q1), vget_low_s16(q2));
+
+ q2 = vcombine_s16(vget_high_s16(q1), vget_high_s16(q2));
+
+ q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
+ q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);
+
+ q3 = vshrq_n_s16(q3, 1);
+ q4 = vshrq_n_s16(q4, 1);
+
+ q3 = vqaddq_s16(q3, q2);
+ q4 = vqaddq_s16(q4, q2);
+
+ d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
+ d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));
+
+ d2 = vqadd_s16(d12, d11);
+ d3 = vqadd_s16(d13, d10);
+ d4 = vqsub_s16(d13, d10);
+ d5 = vqsub_s16(d12, d11);
+
+ d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
+ d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
+ d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),
+ vreinterpret_s16_s32(d2tmp1.val[0]));
+ d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),
+ vreinterpret_s16_s32(d2tmp1.val[1]));
+
+ // second 1-D IDCT pass, applied to the data transposed above
+ q2 = vcombine_s16(d2tmp2.val[1], d2tmp3.val[1]);
+
+ q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
+ q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);
+
+ d12 = vqadd_s16(d2tmp2.val[0], d2tmp3.val[0]);
+ d13 = vqsub_s16(d2tmp2.val[0], d2tmp3.val[0]);
+
+ q3 = vshrq_n_s16(q3, 1);
+ q4 = vshrq_n_s16(q4, 1);
+
+ q3 = vqaddq_s16(q3, q2);
+ q4 = vqaddq_s16(q4, q2);
+
+ d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
+ d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));
+
+ d2 = vqadd_s16(d12, d11);
+ d3 = vqadd_s16(d13, d10);
+ d4 = vqsub_s16(d13, d10);
+ d5 = vqsub_s16(d12, d11);
+
+ d2 = vrshr_n_s16(d2, 3);
+ d3 = vrshr_n_s16(d3, 3);
+ d4 = vrshr_n_s16(d4, 3);
+ d5 = vrshr_n_s16(d5, 3);
+
+ d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
+ d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
+ d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),
+ vreinterpret_s16_s32(d2tmp1.val[0]));
+ d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),
+ vreinterpret_s16_s32(d2tmp1.val[1]));
+
+ q1 = vcombine_s16(d2tmp2.val[0], d2tmp2.val[1]);
+ q2 = vcombine_s16(d2tmp3.val[0], d2tmp3.val[1]);
+
+ q1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q1),
+ vreinterpret_u8_s32(d14)));
+ q2 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2),
+ vreinterpret_u8_s32(d15)));
+
+ d14 = vreinterpret_s32_u8(vqmovun_s16(q1));
+ d15 = vreinterpret_s32_u8(vqmovun_s16(q2));
+
+ dst0 = dst;
+ vst1_lane_s32((int32_t *)dst0, d14, 0);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst0, d14, 1);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst0, d15, 0);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst0, d15, 1);
+ return;
+}
diff --git a/media/libvpx/vp8/common/arm/neon/dequantizeb_neon.c b/media/libvpx/vp8/common/arm/neon/dequantizeb_neon.c
new file mode 100644
index 000000000..54e709dd3
--- /dev/null
+++ b/media/libvpx/vp8/common/arm/neon/dequantizeb_neon.c
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "vp8/common/blockd.h"
+
+void vp8_dequantize_b_neon(BLOCKD *d, short *DQC) {
+ int16x8x2_t qQ, qDQC, qDQ;
+
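+ /* vld2q/vst2q de-interleave and re-interleave the 16 coefficients; since
+ * dequantization is a plain element-wise multiply, the interleaving cancels
+ * out and every coefficient is simply scaled by the matching DQC entry. */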
+ qQ = vld2q_s16(d->qcoeff);
+ qDQC = vld2q_s16(DQC);
+
+ qDQ.val[0] = vmulq_s16(qQ.val[0], qDQC.val[0]);
+ qDQ.val[1] = vmulq_s16(qQ.val[1], qDQC.val[1]);
+
+ vst2q_s16(d->dqcoeff, qDQ);
+}
diff --git a/media/libvpx/vp8/common/arm/neon/idct_blk_neon.c b/media/libvpx/vp8/common/arm/neon/idct_blk_neon.c
new file mode 100644
index 000000000..fb327a726
--- /dev/null
+++ b/media/libvpx/vp8/common/arm/neon/idct_blk_neon.c
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+
+/* place these declarations here because we don't want to maintain them
+ * outside of this scope
+ */
+void idct_dequant_full_2x_neon(short *q, short *dq,
+ unsigned char *dst, int stride);
+void idct_dequant_0_2x_neon(short *q, short dq,
+ unsigned char *dst, int stride);
+
+
+void vp8_dequant_idct_add_y_block_neon(short *q, short *dq,
+ unsigned char *dst,
+ int stride, char *eobs)
+{
+ int i;
+
+ for (i = 0; i < 4; i++)
+ {
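+ /* eobs holds one end-of-block count per 4x4 block; each test below covers
+ * a pair of blocks.  Masking the pair of bytes with 0xfefe clears bit 0 of
+ * each byte, so a non-zero result means at least one of the two blocks has
+ * more than a DC coefficient and needs the full IDCT; otherwise the cheaper
+ * DC-only path is used. */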
+ if (((short *)(eobs))[0])
+ {
+ if (((short *)eobs)[0] & 0xfefe)
+ idct_dequant_full_2x_neon (q, dq, dst, stride);
+ else
+ idct_dequant_0_2x_neon (q, dq[0], dst, stride);
+ }
+
+ if (((short *)(eobs))[1])
+ {
+ if (((short *)eobs)[1] & 0xfefe)
+ idct_dequant_full_2x_neon (q+32, dq, dst+8, stride);
+ else
+ idct_dequant_0_2x_neon (q+32, dq[0], dst+8, stride);
+ }
+ q += 64;
+ dst += 4*stride;
+ eobs += 4;
+ }
+}
+
+void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq,
+ unsigned char *dstu,
+ unsigned char *dstv,
+ int stride, char *eobs)
+{
+ if (((short *)(eobs))[0])
+ {
+ if (((short *)eobs)[0] & 0xfefe)
+ idct_dequant_full_2x_neon (q, dq, dstu, stride);
+ else
+ idct_dequant_0_2x_neon (q, dq[0], dstu, stride);
+ }
+
+ q += 32;
+ dstu += 4*stride;
+
+ if (((short *)(eobs))[1])
+ {
+ if (((short *)eobs)[1] & 0xfefe)
+ idct_dequant_full_2x_neon (q, dq, dstu, stride);
+ else
+ idct_dequant_0_2x_neon (q, dq[0], dstu, stride);
+ }
+
+ q += 32;
+
+ if (((short *)(eobs))[2])
+ {
+ if (((short *)eobs)[2] & 0xfefe)
+ idct_dequant_full_2x_neon (q, dq, dstv, stride);
+ else
+ idct_dequant_0_2x_neon (q, dq[0], dstv, stride);
+ }
+
+ q += 32;
+ dstv += 4*stride;
+
+ if (((short *)(eobs))[3])
+ {
+ if (((short *)eobs)[3] & 0xfefe)
+ idct_dequant_full_2x_neon (q, dq, dstv, stride);
+ else
+ idct_dequant_0_2x_neon (q, dq[0], dstv, stride);
+ }
+}
diff --git a/media/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c b/media/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c
new file mode 100644
index 000000000..967c32280
--- /dev/null
+++ b/media/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void idct_dequant_0_2x_neon(
+ int16_t *q,
+ int16_t dq,
+ unsigned char *dst,
+ int stride) {
+ unsigned char *dst0;
+ int i, a0, a1;
+ int16x8x2_t q2Add;
+ int32x2_t d2s32 = vdup_n_s32(0), d4s32 = vdup_n_s32(0);  // avoid reading undefined lanes in the vld1_lane loads below
+ uint8x8_t d2u8, d4u8;
+ uint16x8_t q1u16, q2u16;
+
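+ /* DC-only shortcut: each of the two 4x4 blocks reduces to adding a single
+ * rounded value, (dc * dq + 4) >> 3, to every pixel of the block. */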
+ a0 = ((q[0] * dq) + 4) >> 3;
+ a1 = ((q[16] * dq) + 4) >> 3;
+ q[0] = q[16] = 0;
+ q2Add.val[0] = vdupq_n_s16((int16_t)a0);
+ q2Add.val[1] = vdupq_n_s16((int16_t)a1);
+
+ for (i = 0; i < 2; i++, dst += 4) {
+ dst0 = dst;
+ d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0);
+ dst0 += stride;
+ d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1);
+ dst0 += stride;
+ d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0);
+ dst0 += stride;
+ d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1);
+
+ q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
+ vreinterpret_u8_s32(d2s32));
+ q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
+ vreinterpret_u8_s32(d4s32));
+
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
+ d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
+
+ d2s32 = vreinterpret_s32_u8(d2u8);
+ d4s32 = vreinterpret_s32_u8(d4u8);
+
+ dst0 = dst;
+ vst1_lane_s32((int32_t *)dst0, d2s32, 0);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst0, d2s32, 1);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst0, d4s32, 0);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst0, d4s32, 1);
+ }
+ return;
+}
diff --git a/media/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c b/media/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c
new file mode 100644
index 000000000..a60ed46b7
--- /dev/null
+++ b/media/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+static const int16_t cospi8sqrt2minus1 = 20091;
+static const int16_t sinpi8sqrt2 = 17734;
+// sin(pi/8)*sqrt(2) in Q16 is 35468 (0x8a8c); its lowest bit is 0, so it is
+// stored here pre-shifted right by one (17734), and the doubling performed
+// by vqdmulh restores the full value.
+
+void idct_dequant_full_2x_neon(
+ int16_t *q,
+ int16_t *dq,
+ unsigned char *dst,
+ int stride) {
+ unsigned char *dst0, *dst1;
+ int32x2_t d28, d29, d30, d31;
+ int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
+ int16x8_t qEmpty = vdupq_n_s16(0);
+ int32x4x2_t q2tmp0, q2tmp1;
+ int16x8x2_t q2tmp2, q2tmp3;
+ int16x4_t dLow0, dLow1, dHigh0, dHigh1;
+
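+ /* Two horizontally adjacent 4x4 blocks are dequantized and inverse
+ * transformed together, one in the low half and one in the high half of
+ * each q register. */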
+ d28 = d29 = d30 = d31 = vdup_n_s32(0);
+
+ // load dq
+ q0 = vld1q_s16(dq);
+ dq += 8;
+ q1 = vld1q_s16(dq);
+
+ // load q
+ q2 = vld1q_s16(q);
+ vst1q_s16(q, qEmpty);
+ q += 8;
+ q3 = vld1q_s16(q);
+ vst1q_s16(q, qEmpty);
+ q += 8;
+ q4 = vld1q_s16(q);
+ vst1q_s16(q, qEmpty);
+ q += 8;
+ q5 = vld1q_s16(q);
+ vst1q_s16(q, qEmpty);
+
+ // load src from dst
+ dst0 = dst;
+ dst1 = dst + 4;
+ d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0);
+ dst0 += stride;
+ d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1);
+ dst1 += stride;
+ d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0);
+ dst0 += stride;
+ d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1);
+ dst1 += stride;
+
+ d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0);
+ dst0 += stride;
+ d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1);
+ dst1 += stride;
+ d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0);
+ d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1);
+
+ q2 = vmulq_s16(q2, q0);
+ q3 = vmulq_s16(q3, q1);
+ q4 = vmulq_s16(q4, q0);
+ q5 = vmulq_s16(q5, q1);
+
+ // vswp
+ dLow0 = vget_low_s16(q2);
+ dHigh0 = vget_high_s16(q2);
+ dLow1 = vget_low_s16(q4);
+ dHigh1 = vget_high_s16(q4);
+ q2 = vcombine_s16(dLow0, dLow1);
+ q4 = vcombine_s16(dHigh0, dHigh1);
+
+ dLow0 = vget_low_s16(q3);
+ dHigh0 = vget_high_s16(q3);
+ dLow1 = vget_low_s16(q5);
+ dHigh1 = vget_high_s16(q5);
+ q3 = vcombine_s16(dLow0, dLow1);
+ q5 = vcombine_s16(dHigh0, dHigh1);
+
+ q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2);
+ q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2);
+ q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1);
+ q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1);
+
+ q10 = vqaddq_s16(q2, q3);
+ q11 = vqsubq_s16(q2, q3);
+
+ q8 = vshrq_n_s16(q8, 1);
+ q9 = vshrq_n_s16(q9, 1);
+
+ q4 = vqaddq_s16(q4, q8);
+ q5 = vqaddq_s16(q5, q9);
+
+ q2 = vqsubq_s16(q6, q5);
+ q3 = vqaddq_s16(q7, q4);
+
+ q4 = vqaddq_s16(q10, q3);
+ q5 = vqaddq_s16(q11, q2);
+ q6 = vqsubq_s16(q11, q2);
+ q7 = vqsubq_s16(q10, q3);
+
+ q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
+ q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
+ q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
+ vreinterpretq_s16_s32(q2tmp1.val[0]));
+ q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
+ vreinterpretq_s16_s32(q2tmp1.val[1]));
+
+ // loop 2
+ q8 = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2);
+ q9 = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2);
+ q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1);
+ q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1);
+
+ q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]);
+ q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]);
+
+ q10 = vshrq_n_s16(q10, 1);
+ q11 = vshrq_n_s16(q11, 1);
+
+ q10 = vqaddq_s16(q2tmp2.val[1], q10);
+ q11 = vqaddq_s16(q2tmp3.val[1], q11);
+
+ q8 = vqsubq_s16(q8, q11);
+ q9 = vqaddq_s16(q9, q10);
+
+ q4 = vqaddq_s16(q2, q9);
+ q5 = vqaddq_s16(q3, q8);
+ q6 = vqsubq_s16(q3, q8);
+ q7 = vqsubq_s16(q2, q9);
+
+ q4 = vrshrq_n_s16(q4, 3);
+ q5 = vrshrq_n_s16(q5, 3);
+ q6 = vrshrq_n_s16(q6, 3);
+ q7 = vrshrq_n_s16(q7, 3);
+
+ q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
+ q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
+ q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
+ vreinterpretq_s16_s32(q2tmp1.val[0]));
+ q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
+ vreinterpretq_s16_s32(q2tmp1.val[1]));
+
+ q4 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]),
+ vreinterpret_u8_s32(d28)));
+ q5 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]),
+ vreinterpret_u8_s32(d29)));
+ q6 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]),
+ vreinterpret_u8_s32(d30)));
+ q7 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]),
+ vreinterpret_u8_s32(d31)));
+
+ d28 = vreinterpret_s32_u8(vqmovun_s16(q4));
+ d29 = vreinterpret_s32_u8(vqmovun_s16(q5));
+ d30 = vreinterpret_s32_u8(vqmovun_s16(q6));
+ d31 = vreinterpret_s32_u8(vqmovun_s16(q7));
+
+ dst0 = dst;
+ dst1 = dst + 4;
+ vst1_lane_s32((int32_t *)dst0, d28, 0);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst1, d28, 1);
+ dst1 += stride;
+ vst1_lane_s32((int32_t *)dst0, d29, 0);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst1, d29, 1);
+ dst1 += stride;
+
+ vst1_lane_s32((int32_t *)dst0, d30, 0);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst1, d30, 1);
+ dst1 += stride;
+ vst1_lane_s32((int32_t *)dst0, d31, 0);
+ vst1_lane_s32((int32_t *)dst1, d31, 1);
+ return;
+}
diff --git a/media/libvpx/vp8/common/arm/neon/iwalsh_neon.c b/media/libvpx/vp8/common/arm/neon/iwalsh_neon.c
new file mode 100644
index 000000000..6ea9dd712
--- /dev/null
+++ b/media/libvpx/vp8/common/arm/neon/iwalsh_neon.c
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void vp8_short_inv_walsh4x4_neon(
+ int16_t *input,
+ int16_t *mb_dqcoeff) {
+ int16x8_t q0s16, q1s16, q2s16, q3s16;
+ int16x4_t d4s16, d5s16, d6s16, d7s16;
+ int16x4x2_t v2tmp0, v2tmp1;
+ int32x2x2_t v2tmp2, v2tmp3;
+ int16x8_t qAdd3;
+
+ q0s16 = vld1q_s16(input);
+ q1s16 = vld1q_s16(input + 8);
+
+ // 1st for loop
+ d4s16 = vadd_s16(vget_low_s16(q0s16), vget_high_s16(q1s16));
+ d6s16 = vadd_s16(vget_high_s16(q0s16), vget_low_s16(q1s16));
+ d5s16 = vsub_s16(vget_low_s16(q0s16), vget_high_s16(q1s16));
+ d7s16 = vsub_s16(vget_high_s16(q0s16), vget_low_s16(q1s16));
+
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q3s16 = vcombine_s16(d6s16, d7s16);
+
+ q0s16 = vaddq_s16(q2s16, q3s16);
+ q1s16 = vsubq_s16(q2s16, q3s16);
+
+ v2tmp2 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(q0s16)),
+ vreinterpret_s32_s16(vget_low_s16(q1s16)));
+ v2tmp3 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(q0s16)),
+ vreinterpret_s32_s16(vget_high_s16(q1s16)));
+ v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),
+ vreinterpret_s16_s32(v2tmp3.val[0]));
+ v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),
+ vreinterpret_s16_s32(v2tmp3.val[1]));
+
+ // 2nd for loop
+ d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
+ d6s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
+ d5s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
+ d7s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q3s16 = vcombine_s16(d6s16, d7s16);
+
+ qAdd3 = vdupq_n_s16(3);
+
+ q0s16 = vaddq_s16(q2s16, q3s16);
+ q1s16 = vsubq_s16(q2s16, q3s16);
+
+ q0s16 = vaddq_s16(q0s16, qAdd3);
+ q1s16 = vaddq_s16(q1s16, qAdd3);
+
+ q0s16 = vshrq_n_s16(q0s16, 3);
+ q1s16 = vshrq_n_s16(q1s16, 3);
+
+ // store
+ vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 0);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 0);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 0);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 0);
+ mb_dqcoeff += 16;
+
+ vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 1);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 1);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 1);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 1);
+ mb_dqcoeff += 16;
+
+ vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 2);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 2);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 2);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 2);
+ mb_dqcoeff += 16;
+
+ vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 3);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 3);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 3);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 3);
+ mb_dqcoeff += 16;
+ return;
+}
diff --git a/media/libvpx/vp8/common/arm/neon/loopfilter_neon.c b/media/libvpx/vp8/common/arm/neon/loopfilter_neon.c
new file mode 100644
index 000000000..9d6807af7
--- /dev/null
+++ b/media/libvpx/vp8/common/arm/neon/loopfilter_neon.c
@@ -0,0 +1,550 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+#include "vpx_ports/arm.h"
+
+static INLINE void vp8_loop_filter_neon(
+ uint8x16_t qblimit, // flimit
+ uint8x16_t qlimit, // limit
+ uint8x16_t qthresh, // thresh
+ uint8x16_t q3, // p3
+ uint8x16_t q4, // p2
+ uint8x16_t q5, // p1
+ uint8x16_t q6, // p0
+ uint8x16_t q7, // q0
+ uint8x16_t q8, // q1
+ uint8x16_t q9, // q2
+ uint8x16_t q10, // q3
+ uint8x16_t *q5r, // p1
+ uint8x16_t *q6r, // p0
+ uint8x16_t *q7r, // q0
+ uint8x16_t *q8r) { // q1
+ uint8x16_t q0u8, q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+ int16x8_t q2s16, q11s16;
+ uint16x8_t q4u16;
+ int8x16_t q1s8, q2s8, q10s8, q11s8, q12s8, q13s8;
+ int8x8_t d2s8, d3s8;
+
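+ /* Build the filter mask from neighbouring-pixel differences: the filter is
+ * applied only where every |p[i+1] - p[i]| and |q[i+1] - q[i]| is within
+ * 'limit' and |p0 - q0| * 2 + |p1 - q1| / 2 does not exceed 'blimit'.
+ * 'thresh' marks high-edge-variance pixels, for which only p0/q0 are
+ * adjusted. */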
+ q11u8 = vabdq_u8(q3, q4);
+ q12u8 = vabdq_u8(q4, q5);
+ q13u8 = vabdq_u8(q5, q6);
+ q14u8 = vabdq_u8(q8, q7);
+ q3 = vabdq_u8(q9, q8);
+ q4 = vabdq_u8(q10, q9);
+
+ q11u8 = vmaxq_u8(q11u8, q12u8);
+ q12u8 = vmaxq_u8(q13u8, q14u8);
+ q3 = vmaxq_u8(q3, q4);
+ q15u8 = vmaxq_u8(q11u8, q12u8);
+
+ q9 = vabdq_u8(q6, q7);
+
+ // vp8_hevmask
+ q13u8 = vcgtq_u8(q13u8, qthresh);
+ q14u8 = vcgtq_u8(q14u8, qthresh);
+ q15u8 = vmaxq_u8(q15u8, q3);
+
+ q2u8 = vabdq_u8(q5, q8);
+ q9 = vqaddq_u8(q9, q9);
+
+ q15u8 = vcgeq_u8(qlimit, q15u8);
+
+ // vp8_filter() function
+ // convert to signed
+ q10 = vdupq_n_u8(0x80);
+ q8 = veorq_u8(q8, q10);
+ q7 = veorq_u8(q7, q10);
+ q6 = veorq_u8(q6, q10);
+ q5 = veorq_u8(q5, q10);
+
+ q2u8 = vshrq_n_u8(q2u8, 1);
+ q9 = vqaddq_u8(q9, q2u8);
+
+ q10 = vdupq_n_u8(3);
+
+ q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
+ vget_low_s8(vreinterpretq_s8_u8(q6)));
+ q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
+ vget_high_s8(vreinterpretq_s8_u8(q6)));
+
+ q9 = vcgeq_u8(qblimit, q9);
+
+ q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
+ vreinterpretq_s8_u8(q8));
+
+ q14u8 = vorrq_u8(q13u8, q14u8);
+
+ q4u16 = vmovl_u8(vget_low_u8(q10));
+ q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
+ q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));
+
+ q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
+ q15u8 = vandq_u8(q15u8, q9);
+
+ q1s8 = vreinterpretq_s8_u8(q1u8);
+ q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
+ q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));
+
+ q9 = vdupq_n_u8(4);
+ // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
+ d2s8 = vqmovn_s16(q2s16);
+ d3s8 = vqmovn_s16(q11s16);
+ q1s8 = vcombine_s8(d2s8, d3s8);
+ q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
+ q1s8 = vreinterpretq_s8_u8(q1u8);
+
+ q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q10));
+ q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
+ q2s8 = vshrq_n_s8(q2s8, 3);
+ q1s8 = vshrq_n_s8(q1s8, 3);
+
+ q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
+ q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);
+
+ q1s8 = vrshrq_n_s8(q1s8, 1);
+ q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
+
+ q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
+ q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);
+
+ q0u8 = vdupq_n_u8(0x80);
+ *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q0u8);
+ *q7r = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
+ *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
+ *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q0u8);
+ return;
+}
+
+void vp8_loop_filter_horizontal_edge_y_neon(
+ unsigned char *src,
+ int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh) {
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+
+ qblimit = vdupq_n_u8(blimit);
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
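+ /* Step back four rows so the eight rows loaded below span p3..p0 above the
+ * edge and q0..q3 below it. */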
+ src -= (pitch << 2);
+
+ q3 = vld1q_u8(src);
+ src += pitch;
+ q4 = vld1q_u8(src);
+ src += pitch;
+ q5 = vld1q_u8(src);
+ src += pitch;
+ q6 = vld1q_u8(src);
+ src += pitch;
+ q7 = vld1q_u8(src);
+ src += pitch;
+ q8 = vld1q_u8(src);
+ src += pitch;
+ q9 = vld1q_u8(src);
+ src += pitch;
+ q10 = vld1q_u8(src);
+
+ vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+ q5, q6, q7, q8, q9, q10,
+ &q5, &q6, &q7, &q8);
+
+ src -= (pitch * 5);
+ vst1q_u8(src, q5);
+ src += pitch;
+ vst1q_u8(src, q6);
+ src += pitch;
+ vst1q_u8(src, q7);
+ src += pitch;
+ vst1q_u8(src, q8);
+ return;
+}
+
+void vp8_loop_filter_horizontal_edge_uv_neon(
+ unsigned char *u,
+ int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh,
+ unsigned char *v) {
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+ uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+ uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+
+ qblimit = vdupq_n_u8(blimit);
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
+
+ u -= (pitch << 2);
+ v -= (pitch << 2);
+
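+ /* The U rows are packed into the low halves and the V rows into the high
+ * halves of q3..q10, so both chroma planes are filtered in a single pass. */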
+ d6 = vld1_u8(u);
+ u += pitch;
+ d7 = vld1_u8(v);
+ v += pitch;
+ d8 = vld1_u8(u);
+ u += pitch;
+ d9 = vld1_u8(v);
+ v += pitch;
+ d10 = vld1_u8(u);
+ u += pitch;
+ d11 = vld1_u8(v);
+ v += pitch;
+ d12 = vld1_u8(u);
+ u += pitch;
+ d13 = vld1_u8(v);
+ v += pitch;
+ d14 = vld1_u8(u);
+ u += pitch;
+ d15 = vld1_u8(v);
+ v += pitch;
+ d16 = vld1_u8(u);
+ u += pitch;
+ d17 = vld1_u8(v);
+ v += pitch;
+ d18 = vld1_u8(u);
+ u += pitch;
+ d19 = vld1_u8(v);
+ v += pitch;
+ d20 = vld1_u8(u);
+ d21 = vld1_u8(v);
+
+ q3 = vcombine_u8(d6, d7);
+ q4 = vcombine_u8(d8, d9);
+ q5 = vcombine_u8(d10, d11);
+ q6 = vcombine_u8(d12, d13);
+ q7 = vcombine_u8(d14, d15);
+ q8 = vcombine_u8(d16, d17);
+ q9 = vcombine_u8(d18, d19);
+ q10 = vcombine_u8(d20, d21);
+
+ vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+ q5, q6, q7, q8, q9, q10,
+ &q5, &q6, &q7, &q8);
+
+ u -= (pitch * 5);
+ vst1_u8(u, vget_low_u8(q5));
+ u += pitch;
+ vst1_u8(u, vget_low_u8(q6));
+ u += pitch;
+ vst1_u8(u, vget_low_u8(q7));
+ u += pitch;
+ vst1_u8(u, vget_low_u8(q8));
+
+ v -= (pitch * 5);
+ vst1_u8(v, vget_high_u8(q5));
+ v += pitch;
+ vst1_u8(v, vget_high_u8(q6));
+ v += pitch;
+ vst1_u8(v, vget_high_u8(q7));
+ v += pitch;
+ vst1_u8(v, vget_high_u8(q8));
+ return;
+}
+
+static INLINE void write_4x8(unsigned char *dst, int pitch,
+ const uint8x8x4_t result) {
+#ifdef VPX_INCOMPATIBLE_GCC
+ /*
+ * uint8x8x4_t result
+ 00 01 02 03 | 04 05 06 07
+ 10 11 12 13 | 14 15 16 17
+ 20 21 22 23 | 24 25 26 27
+ 30 31 32 33 | 34 35 36 37
+ ---
+ * after vtrn_u16
+ 00 01 20 21 | 04 05 24 25
+ 02 03 22 23 | 06 07 26 27
+ 10 11 30 31 | 14 15 34 35
+ 12 13 32 33 | 16 17 36 37
+ ---
+ * after vtrn_u8
+ 00 10 20 30 | 04 14 24 34
+ 01 11 21 31 | 05 15 25 35
+ 02 12 22 32 | 06 16 26 36
+ 03 13 23 33 | 07 17 27 37
+ */
+ const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[0]),
+ vreinterpret_u16_u8(result.val[2]));
+ const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[1]),
+ vreinterpret_u16_u8(result.val[3]));
+ const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
+ vreinterpret_u8_u16(r13_u16.val[0]));
+ const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
+ vreinterpret_u8_u16(r13_u16.val[1]));
+ const uint32x2_t x_0_4 = vreinterpret_u32_u8(r01_u8.val[0]);
+ const uint32x2_t x_1_5 = vreinterpret_u32_u8(r01_u8.val[1]);
+ const uint32x2_t x_2_6 = vreinterpret_u32_u8(r23_u8.val[0]);
+ const uint32x2_t x_3_7 = vreinterpret_u32_u8(r23_u8.val[1]);
+ vst1_lane_u32((uint32_t *)dst, x_0_4, 0);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_1_5, 0);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_2_6, 0);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_3_7, 0);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_0_4, 1);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_1_5, 1);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_2_6, 1);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_3_7, 1);
+#else
+ vst4_lane_u8(dst, result, 0);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 1);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 2);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 3);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 4);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 5);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 6);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 7);
+#endif // VPX_INCOMPATIBLE_GCC
+}
+
+void vp8_loop_filter_vertical_edge_y_neon(
+ unsigned char *src,
+ int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh) {
+ unsigned char *s, *d;
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+ uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+ uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+ uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
+ uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
+ uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
+ uint8x8x4_t q4ResultH, q4ResultL;
+
+ qblimit = vdupq_n_u8(blimit);
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
+
+ s = src - 4;
+ d6 = vld1_u8(s);
+ s += pitch;
+ d8 = vld1_u8(s);
+ s += pitch;
+ d10 = vld1_u8(s);
+ s += pitch;
+ d12 = vld1_u8(s);
+ s += pitch;
+ d14 = vld1_u8(s);
+ s += pitch;
+ d16 = vld1_u8(s);
+ s += pitch;
+ d18 = vld1_u8(s);
+ s += pitch;
+ d20 = vld1_u8(s);
+ s += pitch;
+ d7 = vld1_u8(s);
+ s += pitch;
+ d9 = vld1_u8(s);
+ s += pitch;
+ d11 = vld1_u8(s);
+ s += pitch;
+ d13 = vld1_u8(s);
+ s += pitch;
+ d15 = vld1_u8(s);
+ s += pitch;
+ d17 = vld1_u8(s);
+ s += pitch;
+ d19 = vld1_u8(s);
+ s += pitch;
+ d21 = vld1_u8(s);
+
+ q3 = vcombine_u8(d6, d7);
+ q4 = vcombine_u8(d8, d9);
+ q5 = vcombine_u8(d10, d11);
+ q6 = vcombine_u8(d12, d13);
+ q7 = vcombine_u8(d14, d15);
+ q8 = vcombine_u8(d16, d17);
+ q9 = vcombine_u8(d18, d19);
+ q10 = vcombine_u8(d20, d21);
+
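+ // Transpose the 16 loaded rows of 8 pixels so the pixels on each side
+ // of the vertical edge become the rows q3..q10 expected by
+ // vp8_loop_filter_neon.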
+ q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+ q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+ q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+ q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+ q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+ vreinterpretq_u16_u32(q2tmp2.val[0]));
+ q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+ vreinterpretq_u16_u32(q2tmp3.val[0]));
+ q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+ vreinterpretq_u16_u32(q2tmp2.val[1]));
+ q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+ vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+ q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+ vreinterpretq_u8_u16(q2tmp5.val[0]));
+ q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+ vreinterpretq_u8_u16(q2tmp5.val[1]));
+ q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+ vreinterpretq_u8_u16(q2tmp7.val[0]));
+ q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+ vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+ q3 = q2tmp8.val[0];
+ q4 = q2tmp8.val[1];
+ q5 = q2tmp9.val[0];
+ q6 = q2tmp9.val[1];
+ q7 = q2tmp10.val[0];
+ q8 = q2tmp10.val[1];
+ q9 = q2tmp11.val[0];
+ q10 = q2tmp11.val[1];
+
+ vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+ q5, q6, q7, q8, q9, q10,
+ &q5, &q6, &q7, &q8);
+
+ q4ResultL.val[0] = vget_low_u8(q5); // d10
+ q4ResultL.val[1] = vget_low_u8(q6); // d12
+ q4ResultL.val[2] = vget_low_u8(q7); // d14
+ q4ResultL.val[3] = vget_low_u8(q8); // d16
+ q4ResultH.val[0] = vget_high_u8(q5); // d11
+ q4ResultH.val[1] = vget_high_u8(q6); // d13
+ q4ResultH.val[2] = vget_high_u8(q7); // d15
+ q4ResultH.val[3] = vget_high_u8(q8); // d17
+
+ d = src - 2;
+ write_4x8(d, pitch, q4ResultL);
+ d += pitch * 8;
+ write_4x8(d, pitch, q4ResultH);
+}
+
+void vp8_loop_filter_vertical_edge_uv_neon(
+ unsigned char *u,
+ int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh,
+ unsigned char *v) {
+ unsigned char *us, *ud;
+ unsigned char *vs, *vd;
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+ uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+ uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+ uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
+ uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
+ uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
+ uint8x8x4_t q4ResultH, q4ResultL;
+
+ qblimit = vdupq_n_u8(blimit);
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
+
+ us = u - 4;
+ d6 = vld1_u8(us);
+ us += pitch;
+ d8 = vld1_u8(us);
+ us += pitch;
+ d10 = vld1_u8(us);
+ us += pitch;
+ d12 = vld1_u8(us);
+ us += pitch;
+ d14 = vld1_u8(us);
+ us += pitch;
+ d16 = vld1_u8(us);
+ us += pitch;
+ d18 = vld1_u8(us);
+ us += pitch;
+ d20 = vld1_u8(us);
+
+ vs = v - 4;
+ d7 = vld1_u8(vs);
+ vs += pitch;
+ d9 = vld1_u8(vs);
+ vs += pitch;
+ d11 = vld1_u8(vs);
+ vs += pitch;
+ d13 = vld1_u8(vs);
+ vs += pitch;
+ d15 = vld1_u8(vs);
+ vs += pitch;
+ d17 = vld1_u8(vs);
+ vs += pitch;
+ d19 = vld1_u8(vs);
+ vs += pitch;
+ d21 = vld1_u8(vs);
+
+ q3 = vcombine_u8(d6, d7);
+ q4 = vcombine_u8(d8, d9);
+ q5 = vcombine_u8(d10, d11);
+ q6 = vcombine_u8(d12, d13);
+ q7 = vcombine_u8(d14, d15);
+ q8 = vcombine_u8(d16, d17);
+ q9 = vcombine_u8(d18, d19);
+ q10 = vcombine_u8(d20, d21);
+
+ q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+ q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+ q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+ q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+ q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+ vreinterpretq_u16_u32(q2tmp2.val[0]));
+ q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+ vreinterpretq_u16_u32(q2tmp3.val[0]));
+ q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+ vreinterpretq_u16_u32(q2tmp2.val[1]));
+ q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+ vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+ q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+ vreinterpretq_u8_u16(q2tmp5.val[0]));
+ q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+ vreinterpretq_u8_u16(q2tmp5.val[1]));
+ q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+ vreinterpretq_u8_u16(q2tmp7.val[0]));
+ q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+ vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+ q3 = q2tmp8.val[0];
+ q4 = q2tmp8.val[1];
+ q5 = q2tmp9.val[0];
+ q6 = q2tmp9.val[1];
+ q7 = q2tmp10.val[0];
+ q8 = q2tmp10.val[1];
+ q9 = q2tmp11.val[0];
+ q10 = q2tmp11.val[1];
+
+ vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+ q5, q6, q7, q8, q9, q10,
+ &q5, &q6, &q7, &q8);
+
+ q4ResultL.val[0] = vget_low_u8(q5); // d10
+ q4ResultL.val[1] = vget_low_u8(q6); // d12
+ q4ResultL.val[2] = vget_low_u8(q7); // d14
+ q4ResultL.val[3] = vget_low_u8(q8); // d16
+ ud = u - 2;
+ write_4x8(ud, pitch, q4ResultL);
+
+ q4ResultH.val[0] = vget_high_u8(q5); // d11
+ q4ResultH.val[1] = vget_high_u8(q6); // d13
+ q4ResultH.val[2] = vget_high_u8(q7); // d15
+ q4ResultH.val[3] = vget_high_u8(q8); // d17
+ vd = v - 2;
+ write_4x8(vd, pitch, q4ResultH);
+}
diff --git a/media/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c b/media/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c
new file mode 100644
index 000000000..b25686ffb
--- /dev/null
+++ b/media/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+
+static INLINE void vp8_loop_filter_simple_horizontal_edge_neon(
+ unsigned char *s,
+ int p,
+ const unsigned char *blimit) {
+ uint8_t *sp;
+ uint8x16_t qblimit, q0u8;
+ uint8x16_t q5u8, q6u8, q7u8, q8u8, q9u8, q10u8, q14u8, q15u8;
+ int16x8_t q2s16, q3s16, q13s16;
+ int8x8_t d8s8, d9s8;
+ int8x16_t q2s8, q3s8, q4s8, q10s8, q11s8, q14s8;
+
+ qblimit = vdupq_n_u8(*blimit);
+
+ sp = s - (p << 1);
+ q5u8 = vld1q_u8(sp);
+ sp += p;
+ q6u8 = vld1q_u8(sp);
+ sp += p;
+ q7u8 = vld1q_u8(sp);
+ sp += p;
+ q8u8 = vld1q_u8(sp);
+
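+ // q5u8..q8u8 hold p1, p0, q0, q1. The simple-filter mask computed
+ // below is 2 * |p0 - q0| + |p1 - q1| / 2 <= blimit (saturating).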
+ q15u8 = vabdq_u8(q6u8, q7u8);
+ q14u8 = vabdq_u8(q5u8, q8u8);
+
+ q15u8 = vqaddq_u8(q15u8, q15u8);
+ q14u8 = vshrq_n_u8(q14u8, 1);
+ q0u8 = vdupq_n_u8(0x80);
+ q13s16 = vdupq_n_s16(3);
+ q15u8 = vqaddq_u8(q15u8, q14u8);
+
+ q5u8 = veorq_u8(q5u8, q0u8);
+ q6u8 = veorq_u8(q6u8, q0u8);
+ q7u8 = veorq_u8(q7u8, q0u8);
+ q8u8 = veorq_u8(q8u8, q0u8);
+
+ q15u8 = vcgeq_u8(qblimit, q15u8);
+
+ q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7u8)),
+ vget_low_s8(vreinterpretq_s8_u8(q6u8)));
+ q3s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7u8)),
+ vget_high_s8(vreinterpretq_s8_u8(q6u8)));
+
+ q4s8 = vqsubq_s8(vreinterpretq_s8_u8(q5u8),
+ vreinterpretq_s8_u8(q8u8));
+
+ q2s16 = vmulq_s16(q2s16, q13s16);
+ q3s16 = vmulq_s16(q3s16, q13s16);
+
+ q10u8 = vdupq_n_u8(3);
+ q9u8 = vdupq_n_u8(4);
+
+ q2s16 = vaddw_s8(q2s16, vget_low_s8(q4s8));
+ q3s16 = vaddw_s8(q3s16, vget_high_s8(q4s8));
+
+ d8s8 = vqmovn_s16(q2s16);
+ d9s8 = vqmovn_s16(q3s16);
+ q4s8 = vcombine_s8(d8s8, d9s8);
+
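+ // Mask the filter value, then Filter1 = clamp(f + 4) >> 3 is
+ // subtracted from q0 and Filter2 = clamp(f + 3) >> 3 is added to p0.
+ // Pixels are biased by 0x80 so signed saturating arithmetic applies.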
+ q14s8 = vandq_s8(q4s8, vreinterpretq_s8_u8(q15u8));
+
+ q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q10u8));
+ q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q9u8));
+ q2s8 = vshrq_n_s8(q2s8, 3);
+ q3s8 = vshrq_n_s8(q3s8, 3);
+
+ q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6u8), q2s8);
+ q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7u8), q3s8);
+
+ q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
+ q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
+
+ vst1q_u8(s, q7u8);
+ s -= p;
+ vst1q_u8(s, q6u8);
+ return;
+}
+
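+// Apply the simple horizontal filter to the three internal block edges
+// (rows 4, 8 and 12) of the macroblock.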
+void vp8_loop_filter_bhs_neon(
+ unsigned char *y_ptr,
+ int y_stride,
+ const unsigned char *blimit) {
+ y_ptr += y_stride * 4;
+ vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
+ y_ptr += y_stride * 4;
+ vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
+ y_ptr += y_stride * 4;
+ vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
+ return;
+}
+
+void vp8_loop_filter_mbhs_neon(
+ unsigned char *y_ptr,
+ int y_stride,
+ const unsigned char *blimit) {
+ vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
+ return;
+}
diff --git a/media/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c b/media/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
new file mode 100644
index 000000000..e1c8609e3
--- /dev/null
+++ b/media/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
@@ -0,0 +1,280 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+#include "vpx_ports/arm.h"
+
+#ifdef VPX_INCOMPATIBLE_GCC
+static INLINE void write_2x4(unsigned char *dst, int pitch,
+ const uint8x8x2_t result) {
+ /*
+ * uint8x8x2_t result
+ 00 01 02 03 | 04 05 06 07
+ 10 11 12 13 | 14 15 16 17
+ ---
+ * after vtrn_u8
+ 00 10 02 12 | 04 14 06 16
+ 01 11 03 13 | 05 15 07 17
+ */
+ const uint8x8x2_t r01_u8 = vtrn_u8(result.val[0],
+ result.val[1]);
+ const uint16x4_t x_0_4 = vreinterpret_u16_u8(r01_u8.val[0]);
+ const uint16x4_t x_1_5 = vreinterpret_u16_u8(r01_u8.val[1]);
+ vst1_lane_u16((uint16_t *)dst, x_0_4, 0);
+ dst += pitch;
+ vst1_lane_u16((uint16_t *)dst, x_1_5, 0);
+ dst += pitch;
+ vst1_lane_u16((uint16_t *)dst, x_0_4, 1);
+ dst += pitch;
+ vst1_lane_u16((uint16_t *)dst, x_1_5, 1);
+ dst += pitch;
+ vst1_lane_u16((uint16_t *)dst, x_0_4, 2);
+ dst += pitch;
+ vst1_lane_u16((uint16_t *)dst, x_1_5, 2);
+ dst += pitch;
+ vst1_lane_u16((uint16_t *)dst, x_0_4, 3);
+ dst += pitch;
+ vst1_lane_u16((uint16_t *)dst, x_1_5, 3);
+}
+
+static INLINE void write_2x8(unsigned char *dst, int pitch,
+ const uint8x8x2_t result,
+ const uint8x8x2_t result2) {
+ write_2x4(dst, pitch, result);
+ dst += pitch * 8;
+ write_2x4(dst, pitch, result2);
+}
+#else
+static INLINE void write_2x8(unsigned char *dst, int pitch,
+ const uint8x8x2_t result,
+ const uint8x8x2_t result2) {
+ vst2_lane_u8(dst, result, 0);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 1);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 2);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 3);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 4);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 5);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 6);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 7);
+ dst += pitch;
+
+ vst2_lane_u8(dst, result2, 0);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 1);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 2);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 3);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 4);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 5);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 6);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 7);
+}
+#endif // VPX_INCOMPATIBLE_GCC
+
+
+#ifdef VPX_INCOMPATIBLE_GCC
+static INLINE
+uint8x8x4_t read_4x8(unsigned char *src, int pitch, uint8x8x4_t x) {
+ const uint8x8_t a = vld1_u8(src);
+ const uint8x8_t b = vld1_u8(src + pitch * 1);
+ const uint8x8_t c = vld1_u8(src + pitch * 2);
+ const uint8x8_t d = vld1_u8(src + pitch * 3);
+ const uint8x8_t e = vld1_u8(src + pitch * 4);
+ const uint8x8_t f = vld1_u8(src + pitch * 5);
+ const uint8x8_t g = vld1_u8(src + pitch * 6);
+ const uint8x8_t h = vld1_u8(src + pitch * 7);
+ const uint32x2x2_t r04_u32 = vtrn_u32(vreinterpret_u32_u8(a),
+ vreinterpret_u32_u8(e));
+ const uint32x2x2_t r15_u32 = vtrn_u32(vreinterpret_u32_u8(b),
+ vreinterpret_u32_u8(f));
+ const uint32x2x2_t r26_u32 = vtrn_u32(vreinterpret_u32_u8(c),
+ vreinterpret_u32_u8(g));
+ const uint32x2x2_t r37_u32 = vtrn_u32(vreinterpret_u32_u8(d),
+ vreinterpret_u32_u8(h));
+ const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u32(r04_u32.val[0]),
+ vreinterpret_u16_u32(r26_u32.val[0]));
+ const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u32(r15_u32.val[0]),
+ vreinterpret_u16_u32(r37_u32.val[0]));
+ const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
+ vreinterpret_u8_u16(r13_u16.val[0]));
+ const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
+ vreinterpret_u8_u16(r13_u16.val[1]));
+ /*
+ * after vtrn_u32
+ 00 01 02 03 | 40 41 42 43
+ 10 11 12 13 | 50 51 52 53
+ 20 21 22 23 | 60 61 62 63
+ 30 31 32 33 | 70 71 72 73
+ ---
+ * after vtrn_u16
+ 00 01 20 21 | 40 41 60 61
+ 02 03 22 23 | 42 43 62 63
+ 10 11 30 31 | 50 51 70 71
+ 12 13 32 33 | 52 53 72 73
+
+ 00 01 20 21 | 40 41 60 61
+ 10 11 30 31 | 50 51 70 71
+ 02 03 22 23 | 42 43 62 63
+ 12 13 32 33 | 52 53 72 73
+ ---
+ * after vtrn_u8
+ 00 10 20 30 | 40 50 60 70
+ 01 11 21 31 | 41 51 61 71
+ 02 12 22 32 | 42 52 62 72
+ 03 13 23 33 | 43 53 63 73
+ */
+ x.val[0] = r01_u8.val[0];
+ x.val[1] = r01_u8.val[1];
+ x.val[2] = r23_u8.val[0];
+ x.val[3] = r23_u8.val[1];
+
+ return x;
+}
+#else
+static INLINE
+uint8x8x4_t read_4x8(unsigned char *src, int pitch, uint8x8x4_t x) {
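+ // vld4_lane_u8 de-interleaves four consecutive source bytes into lane
+ // i of x.val[0..3], so the eight loads perform the 4x8 transpose.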
+ x = vld4_lane_u8(src, x, 0);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 1);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 2);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 3);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 4);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 5);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 6);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 7);
+ return x;
+}
+#endif // VPX_INCOMPATIBLE_GCC
+
+static INLINE void vp8_loop_filter_simple_vertical_edge_neon(
+ unsigned char *s,
+ int p,
+ const unsigned char *blimit) {
+ unsigned char *src1;
+ uint8x16_t qblimit, q0u8;
+ uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q11u8, q12u8, q14u8, q15u8;
+ int16x8_t q2s16, q13s16, q11s16;
+ int8x8_t d28s8, d29s8;
+ int8x16_t q2s8, q3s8, q10s8, q11s8, q14s8;
+ uint8x8x4_t d0u8x4; // d6, d7, d8, d9
+ uint8x8x4_t d1u8x4; // d10, d11, d12, d13
+ uint8x8x2_t d2u8x2; // d12, d13
+ uint8x8x2_t d3u8x2; // d14, d15
+
+ qblimit = vdupq_n_u8(*blimit);
+
+ src1 = s - 2;
+ d0u8x4 = read_4x8(src1, p, d0u8x4);
+ src1 += p * 8;
+ d1u8x4 = read_4x8(src1, p, d1u8x4);
+
+ q3u8 = vcombine_u8(d0u8x4.val[0], d1u8x4.val[0]); // d6 d10
+ q4u8 = vcombine_u8(d0u8x4.val[2], d1u8x4.val[2]); // d8 d12
+ q5u8 = vcombine_u8(d0u8x4.val[1], d1u8x4.val[1]); // d7 d11
+ q6u8 = vcombine_u8(d0u8x4.val[3], d1u8x4.val[3]); // d9 d13
+
+ q15u8 = vabdq_u8(q5u8, q4u8);
+ q14u8 = vabdq_u8(q3u8, q6u8);
+
+ q15u8 = vqaddq_u8(q15u8, q15u8);
+ q14u8 = vshrq_n_u8(q14u8, 1);
+ q0u8 = vdupq_n_u8(0x80);
+ q11s16 = vdupq_n_s16(3);
+ q15u8 = vqaddq_u8(q15u8, q14u8);
+
+ q3u8 = veorq_u8(q3u8, q0u8);
+ q4u8 = veorq_u8(q4u8, q0u8);
+ q5u8 = veorq_u8(q5u8, q0u8);
+ q6u8 = veorq_u8(q6u8, q0u8);
+
+ q15u8 = vcgeq_u8(qblimit, q15u8);
+
+ q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q4u8)),
+ vget_low_s8(vreinterpretq_s8_u8(q5u8)));
+ q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q4u8)),
+ vget_high_s8(vreinterpretq_s8_u8(q5u8)));
+
+ q14s8 = vqsubq_s8(vreinterpretq_s8_u8(q3u8),
+ vreinterpretq_s8_u8(q6u8));
+
+ q2s16 = vmulq_s16(q2s16, q11s16);
+ q13s16 = vmulq_s16(q13s16, q11s16);
+
+ q11u8 = vdupq_n_u8(3);
+ q12u8 = vdupq_n_u8(4);
+
+ q2s16 = vaddw_s8(q2s16, vget_low_s8(q14s8));
+ q13s16 = vaddw_s8(q13s16, vget_high_s8(q14s8));
+
+ d28s8 = vqmovn_s16(q2s16);
+ d29s8 = vqmovn_s16(q13s16);
+ q14s8 = vcombine_s8(d28s8, d29s8);
+
+ q14s8 = vandq_s8(q14s8, vreinterpretq_s8_u8(q15u8));
+
+ q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q11u8));
+ q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q12u8));
+ q2s8 = vshrq_n_s8(q2s8, 3);
+ q14s8 = vshrq_n_s8(q3s8, 3);
+
+ q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q5u8), q2s8);
+ q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q4u8), q14s8);
+
+ q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
+ q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
+
+ d2u8x2.val[0] = vget_low_u8(q6u8); // d12
+ d2u8x2.val[1] = vget_low_u8(q7u8); // d14
+ d3u8x2.val[0] = vget_high_u8(q6u8); // d13
+ d3u8x2.val[1] = vget_high_u8(q7u8); // d15
+
+ src1 = s - 1;
+ write_2x8(src1, p, d2u8x2, d3u8x2);
+}
+
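+// Apply the simple vertical filter to the three internal block edges
+// (columns 4, 8 and 12) of the macroblock.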
+void vp8_loop_filter_bvs_neon(
+ unsigned char *y_ptr,
+ int y_stride,
+ const unsigned char *blimit) {
+ y_ptr += 4;
+ vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+ y_ptr += 4;
+ vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+ y_ptr += 4;
+ vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+ return;
+}
+
+void vp8_loop_filter_mbvs_neon(
+ unsigned char *y_ptr,
+ int y_stride,
+ const unsigned char *blimit) {
+ vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+ return;
+}
diff --git a/media/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c b/media/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c
new file mode 100644
index 000000000..5351f4be6
--- /dev/null
+++ b/media/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c
@@ -0,0 +1,625 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+
+static INLINE void vp8_mbloop_filter_neon(
+ uint8x16_t qblimit, // mblimit
+ uint8x16_t qlimit, // limit
+ uint8x16_t qthresh, // thresh
+ uint8x16_t q3, // p3
+ uint8x16_t q4, // p2
+ uint8x16_t q5, // p1
+ uint8x16_t q6, // p0
+ uint8x16_t q7, // q0
+ uint8x16_t q8, // q1
+ uint8x16_t q9, // q2
+ uint8x16_t q10, // q3
+ uint8x16_t *q4r, // p2
+ uint8x16_t *q5r, // p1
+ uint8x16_t *q6r, // p0
+ uint8x16_t *q7r, // q0
+ uint8x16_t *q8r, // q1
+ uint8x16_t *q9r) { // q2
+ uint8x16_t q0u8, q1u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+ int16x8_t q0s16, q2s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ int8x16_t q1s8, q6s8, q7s8, q2s8, q11s8, q13s8;
+ uint16x8_t q0u16, q11u16, q12u16, q13u16, q14u16, q15u16;
+ int8x16_t q0s8, q12s8, q14s8, q15s8;
+ int8x8_t d0, d1, d2, d3, d4, d5, d24, d25, d28, d29;
+
+ q11u8 = vabdq_u8(q3, q4);
+ q12u8 = vabdq_u8(q4, q5);
+ q13u8 = vabdq_u8(q5, q6);
+ q14u8 = vabdq_u8(q8, q7);
+ q1u8 = vabdq_u8(q9, q8);
+ q0u8 = vabdq_u8(q10, q9);
+
+ q11u8 = vmaxq_u8(q11u8, q12u8);
+ q12u8 = vmaxq_u8(q13u8, q14u8);
+ q1u8 = vmaxq_u8(q1u8, q0u8);
+ q15u8 = vmaxq_u8(q11u8, q12u8);
+
+ q12u8 = vabdq_u8(q6, q7);
+
+ // vp8_hevmask
+ q13u8 = vcgtq_u8(q13u8, qthresh);
+ q14u8 = vcgtq_u8(q14u8, qthresh);
+ q15u8 = vmaxq_u8(q15u8, q1u8);
+
+ q15u8 = vcgeq_u8(qlimit, q15u8);
+
+ q1u8 = vabdq_u8(q5, q8);
+ q12u8 = vqaddq_u8(q12u8, q12u8);
+
+ // vp8_filter() function
+ // convert to signed
+ q0u8 = vdupq_n_u8(0x80);
+ q9 = veorq_u8(q9, q0u8);
+ q8 = veorq_u8(q8, q0u8);
+ q7 = veorq_u8(q7, q0u8);
+ q6 = veorq_u8(q6, q0u8);
+ q5 = veorq_u8(q5, q0u8);
+ q4 = veorq_u8(q4, q0u8);
+
+ q1u8 = vshrq_n_u8(q1u8, 1);
+ q12u8 = vqaddq_u8(q12u8, q1u8);
+
+ q14u8 = vorrq_u8(q13u8, q14u8);
+ q12u8 = vcgeq_u8(qblimit, q12u8);
+
+ q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
+ vget_low_s8(vreinterpretq_s8_u8(q6)));
+ q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
+ vget_high_s8(vreinterpretq_s8_u8(q6)));
+
+ q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
+ vreinterpretq_s8_u8(q8));
+
+ q11s16 = vdupq_n_s16(3);
+ q2s16 = vmulq_s16(q2s16, q11s16);
+ q13s16 = vmulq_s16(q13s16, q11s16);
+
+ q15u8 = vandq_u8(q15u8, q12u8);
+
+ q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
+ q13s16 = vaddw_s8(q13s16, vget_high_s8(q1s8));
+
+ q12u8 = vdupq_n_u8(3);
+ q11u8 = vdupq_n_u8(4);
+ // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
+ d2 = vqmovn_s16(q2s16);
+ d3 = vqmovn_s16(q13s16);
+ q1s8 = vcombine_s8(d2, d3);
+ q1s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q15u8));
+ q13s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
+
+ q2s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q11u8));
+ q13s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q12u8));
+ q2s8 = vshrq_n_s8(q2s8, 3);
+ q13s8 = vshrq_n_s8(q13s8, 3);
+
+ q7s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q2s8);
+ q6s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q13s8);
+
+ q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
+
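+ // For the non-hev filter value f, compute a = (k * f + 63) >> 7 with
+ // k = 27, 18, 9 and apply it to the (p0,q0), (p1,q1), (p2,q2) pairs:
+ // added on the p side, subtracted on the q side.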
+ q0u16 = q11u16 = q12u16 = q13u16 = q14u16 = q15u16 = vdupq_n_u16(63);
+ d5 = vdup_n_s8(9);
+ d4 = vdup_n_s8(18);
+
+ q0s16 = vmlal_s8(vreinterpretq_s16_u16(q0u16), vget_low_s8(q1s8), d5);
+ q11s16 = vmlal_s8(vreinterpretq_s16_u16(q11u16), vget_high_s8(q1s8), d5);
+ d5 = vdup_n_s8(27);
+ q12s16 = vmlal_s8(vreinterpretq_s16_u16(q12u16), vget_low_s8(q1s8), d4);
+ q13s16 = vmlal_s8(vreinterpretq_s16_u16(q13u16), vget_high_s8(q1s8), d4);
+ q14s16 = vmlal_s8(vreinterpretq_s16_u16(q14u16), vget_low_s8(q1s8), d5);
+ q15s16 = vmlal_s8(vreinterpretq_s16_u16(q15u16), vget_high_s8(q1s8), d5);
+
+ d0 = vqshrn_n_s16(q0s16 , 7);
+ d1 = vqshrn_n_s16(q11s16, 7);
+ d24 = vqshrn_n_s16(q12s16, 7);
+ d25 = vqshrn_n_s16(q13s16, 7);
+ d28 = vqshrn_n_s16(q14s16, 7);
+ d29 = vqshrn_n_s16(q15s16, 7);
+
+ q0s8 = vcombine_s8(d0, d1);
+ q12s8 = vcombine_s8(d24, d25);
+ q14s8 = vcombine_s8(d28, d29);
+
+ q11s8 = vqsubq_s8(vreinterpretq_s8_u8(q9), q0s8);
+ q0s8 = vqaddq_s8(vreinterpretq_s8_u8(q4), q0s8);
+ q13s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q12s8);
+ q12s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q12s8);
+ q15s8 = vqsubq_s8((q7s8), q14s8);
+ q14s8 = vqaddq_s8((q6s8), q14s8);
+
+ q1u8 = vdupq_n_u8(0x80);
+ *q9r = veorq_u8(vreinterpretq_u8_s8(q11s8), q1u8);
+ *q8r = veorq_u8(vreinterpretq_u8_s8(q13s8), q1u8);
+ *q7r = veorq_u8(vreinterpretq_u8_s8(q15s8), q1u8);
+ *q6r = veorq_u8(vreinterpretq_u8_s8(q14s8), q1u8);
+ *q5r = veorq_u8(vreinterpretq_u8_s8(q12s8), q1u8);
+ *q4r = veorq_u8(vreinterpretq_u8_s8(q0s8), q1u8);
+ return;
+}
+
+void vp8_mbloop_filter_horizontal_edge_y_neon(
+ unsigned char *src,
+ int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh) {
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+
+ qblimit = vdupq_n_u8(blimit);
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
+
+ src -= (pitch << 2);
+
+ q3 = vld1q_u8(src);
+ src += pitch;
+ q4 = vld1q_u8(src);
+ src += pitch;
+ q5 = vld1q_u8(src);
+ src += pitch;
+ q6 = vld1q_u8(src);
+ src += pitch;
+ q7 = vld1q_u8(src);
+ src += pitch;
+ q8 = vld1q_u8(src);
+ src += pitch;
+ q9 = vld1q_u8(src);
+ src += pitch;
+ q10 = vld1q_u8(src);
+
+ vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+ q5, q6, q7, q8, q9, q10,
+ &q4, &q5, &q6, &q7, &q8, &q9);
+
+ src -= (pitch * 6);
+ vst1q_u8(src, q4);
+ src += pitch;
+ vst1q_u8(src, q5);
+ src += pitch;
+ vst1q_u8(src, q6);
+ src += pitch;
+ vst1q_u8(src, q7);
+ src += pitch;
+ vst1q_u8(src, q8);
+ src += pitch;
+ vst1q_u8(src, q9);
+ return;
+}
+
+void vp8_mbloop_filter_horizontal_edge_uv_neon(
+ unsigned char *u,
+ int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh,
+ unsigned char *v) {
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+ uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+ uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+
+ qblimit = vdupq_n_u8(blimit);
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
+
+ u -= (pitch << 2);
+ v -= (pitch << 2);
+
+ d6 = vld1_u8(u);
+ u += pitch;
+ d7 = vld1_u8(v);
+ v += pitch;
+ d8 = vld1_u8(u);
+ u += pitch;
+ d9 = vld1_u8(v);
+ v += pitch;
+ d10 = vld1_u8(u);
+ u += pitch;
+ d11 = vld1_u8(v);
+ v += pitch;
+ d12 = vld1_u8(u);
+ u += pitch;
+ d13 = vld1_u8(v);
+ v += pitch;
+ d14 = vld1_u8(u);
+ u += pitch;
+ d15 = vld1_u8(v);
+ v += pitch;
+ d16 = vld1_u8(u);
+ u += pitch;
+ d17 = vld1_u8(v);
+ v += pitch;
+ d18 = vld1_u8(u);
+ u += pitch;
+ d19 = vld1_u8(v);
+ v += pitch;
+ d20 = vld1_u8(u);
+ d21 = vld1_u8(v);
+
+ q3 = vcombine_u8(d6, d7);
+ q4 = vcombine_u8(d8, d9);
+ q5 = vcombine_u8(d10, d11);
+ q6 = vcombine_u8(d12, d13);
+ q7 = vcombine_u8(d14, d15);
+ q8 = vcombine_u8(d16, d17);
+ q9 = vcombine_u8(d18, d19);
+ q10 = vcombine_u8(d20, d21);
+
+ vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+ q5, q6, q7, q8, q9, q10,
+ &q4, &q5, &q6, &q7, &q8, &q9);
+
+ u -= (pitch * 6);
+ v -= (pitch * 6);
+ vst1_u8(u, vget_low_u8(q4));
+ u += pitch;
+ vst1_u8(v, vget_high_u8(q4));
+ v += pitch;
+ vst1_u8(u, vget_low_u8(q5));
+ u += pitch;
+ vst1_u8(v, vget_high_u8(q5));
+ v += pitch;
+ vst1_u8(u, vget_low_u8(q6));
+ u += pitch;
+ vst1_u8(v, vget_high_u8(q6));
+ v += pitch;
+ vst1_u8(u, vget_low_u8(q7));
+ u += pitch;
+ vst1_u8(v, vget_high_u8(q7));
+ v += pitch;
+ vst1_u8(u, vget_low_u8(q8));
+ u += pitch;
+ vst1_u8(v, vget_high_u8(q8));
+ v += pitch;
+ vst1_u8(u, vget_low_u8(q9));
+ vst1_u8(v, vget_high_u8(q9));
+ return;
+}
+
+void vp8_mbloop_filter_vertical_edge_y_neon(
+ unsigned char *src,
+ int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh) {
+ unsigned char *s1, *s2;
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+ uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+ uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+ uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
+ uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
+ uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
+
+ qblimit = vdupq_n_u8(blimit);
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
+
+ s1 = src - 4;
+ s2 = s1 + 8 * pitch;
+ d6 = vld1_u8(s1);
+ s1 += pitch;
+ d7 = vld1_u8(s2);
+ s2 += pitch;
+ d8 = vld1_u8(s1);
+ s1 += pitch;
+ d9 = vld1_u8(s2);
+ s2 += pitch;
+ d10 = vld1_u8(s1);
+ s1 += pitch;
+ d11 = vld1_u8(s2);
+ s2 += pitch;
+ d12 = vld1_u8(s1);
+ s1 += pitch;
+ d13 = vld1_u8(s2);
+ s2 += pitch;
+ d14 = vld1_u8(s1);
+ s1 += pitch;
+ d15 = vld1_u8(s2);
+ s2 += pitch;
+ d16 = vld1_u8(s1);
+ s1 += pitch;
+ d17 = vld1_u8(s2);
+ s2 += pitch;
+ d18 = vld1_u8(s1);
+ s1 += pitch;
+ d19 = vld1_u8(s2);
+ s2 += pitch;
+ d20 = vld1_u8(s1);
+ d21 = vld1_u8(s2);
+
+ q3 = vcombine_u8(d6, d7);
+ q4 = vcombine_u8(d8, d9);
+ q5 = vcombine_u8(d10, d11);
+ q6 = vcombine_u8(d12, d13);
+ q7 = vcombine_u8(d14, d15);
+ q8 = vcombine_u8(d16, d17);
+ q9 = vcombine_u8(d18, d19);
+ q10 = vcombine_u8(d20, d21);
+
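+ // Transpose the tile so the pixels on each side of the vertical edge
+ // become the rows expected by vp8_mbloop_filter_neon; the result is
+ // transposed back before the stores.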
+ q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+ q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+ q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+ q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+ q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+ vreinterpretq_u16_u32(q2tmp2.val[0]));
+ q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+ vreinterpretq_u16_u32(q2tmp3.val[0]));
+ q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+ vreinterpretq_u16_u32(q2tmp2.val[1]));
+ q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+ vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+ q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+ vreinterpretq_u8_u16(q2tmp5.val[0]));
+ q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+ vreinterpretq_u8_u16(q2tmp5.val[1]));
+ q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+ vreinterpretq_u8_u16(q2tmp7.val[0]));
+ q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+ vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+ q3 = q2tmp8.val[0];
+ q4 = q2tmp8.val[1];
+ q5 = q2tmp9.val[0];
+ q6 = q2tmp9.val[1];
+ q7 = q2tmp10.val[0];
+ q8 = q2tmp10.val[1];
+ q9 = q2tmp11.val[0];
+ q10 = q2tmp11.val[1];
+
+ vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+ q5, q6, q7, q8, q9, q10,
+ &q4, &q5, &q6, &q7, &q8, &q9);
+
+ q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+ q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+ q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+ q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+ q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+ vreinterpretq_u16_u32(q2tmp2.val[0]));
+ q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+ vreinterpretq_u16_u32(q2tmp3.val[0]));
+ q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+ vreinterpretq_u16_u32(q2tmp2.val[1]));
+ q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+ vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+ q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+ vreinterpretq_u8_u16(q2tmp5.val[0]));
+ q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+ vreinterpretq_u8_u16(q2tmp5.val[1]));
+ q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+ vreinterpretq_u8_u16(q2tmp7.val[0]));
+ q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+ vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+ q3 = q2tmp8.val[0];
+ q4 = q2tmp8.val[1];
+ q5 = q2tmp9.val[0];
+ q6 = q2tmp9.val[1];
+ q7 = q2tmp10.val[0];
+ q8 = q2tmp10.val[1];
+ q9 = q2tmp11.val[0];
+ q10 = q2tmp11.val[1];
+
+ s1 -= 7 * pitch;
+ s2 -= 7 * pitch;
+
+ vst1_u8(s1, vget_low_u8(q3));
+ s1 += pitch;
+ vst1_u8(s2, vget_high_u8(q3));
+ s2 += pitch;
+ vst1_u8(s1, vget_low_u8(q4));
+ s1 += pitch;
+ vst1_u8(s2, vget_high_u8(q4));
+ s2 += pitch;
+ vst1_u8(s1, vget_low_u8(q5));
+ s1 += pitch;
+ vst1_u8(s2, vget_high_u8(q5));
+ s2 += pitch;
+ vst1_u8(s1, vget_low_u8(q6));
+ s1 += pitch;
+ vst1_u8(s2, vget_high_u8(q6));
+ s2 += pitch;
+ vst1_u8(s1, vget_low_u8(q7));
+ s1 += pitch;
+ vst1_u8(s2, vget_high_u8(q7));
+ s2 += pitch;
+ vst1_u8(s1, vget_low_u8(q8));
+ s1 += pitch;
+ vst1_u8(s2, vget_high_u8(q8));
+ s2 += pitch;
+ vst1_u8(s1, vget_low_u8(q9));
+ s1 += pitch;
+ vst1_u8(s2, vget_high_u8(q9));
+ s2 += pitch;
+ vst1_u8(s1, vget_low_u8(q10));
+ vst1_u8(s2, vget_high_u8(q10));
+ return;
+}
+
+void vp8_mbloop_filter_vertical_edge_uv_neon(
+ unsigned char *u,
+ int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh,
+ unsigned char *v) {
+ unsigned char *us, *ud;
+ unsigned char *vs, *vd;
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+ uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+ uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+ uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
+ uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
+ uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
+
+ qblimit = vdupq_n_u8(blimit);
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
+
+ us = u - 4;
+ vs = v - 4;
+ d6 = vld1_u8(us);
+ us += pitch;
+ d7 = vld1_u8(vs);
+ vs += pitch;
+ d8 = vld1_u8(us);
+ us += pitch;
+ d9 = vld1_u8(vs);
+ vs += pitch;
+ d10 = vld1_u8(us);
+ us += pitch;
+ d11 = vld1_u8(vs);
+ vs += pitch;
+ d12 = vld1_u8(us);
+ us += pitch;
+ d13 = vld1_u8(vs);
+ vs += pitch;
+ d14 = vld1_u8(us);
+ us += pitch;
+ d15 = vld1_u8(vs);
+ vs += pitch;
+ d16 = vld1_u8(us);
+ us += pitch;
+ d17 = vld1_u8(vs);
+ vs += pitch;
+ d18 = vld1_u8(us);
+ us += pitch;
+ d19 = vld1_u8(vs);
+ vs += pitch;
+ d20 = vld1_u8(us);
+ d21 = vld1_u8(vs);
+
+ q3 = vcombine_u8(d6, d7);
+ q4 = vcombine_u8(d8, d9);
+ q5 = vcombine_u8(d10, d11);
+ q6 = vcombine_u8(d12, d13);
+ q7 = vcombine_u8(d14, d15);
+ q8 = vcombine_u8(d16, d17);
+ q9 = vcombine_u8(d18, d19);
+ q10 = vcombine_u8(d20, d21);
+
+ q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+ q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+ q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+ q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+ q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+ vreinterpretq_u16_u32(q2tmp2.val[0]));
+ q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+ vreinterpretq_u16_u32(q2tmp3.val[0]));
+ q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+ vreinterpretq_u16_u32(q2tmp2.val[1]));
+ q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+ vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+ q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+ vreinterpretq_u8_u16(q2tmp5.val[0]));
+ q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+ vreinterpretq_u8_u16(q2tmp5.val[1]));
+ q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+ vreinterpretq_u8_u16(q2tmp7.val[0]));
+ q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+ vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+ q3 = q2tmp8.val[0];
+ q4 = q2tmp8.val[1];
+ q5 = q2tmp9.val[0];
+ q6 = q2tmp9.val[1];
+ q7 = q2tmp10.val[0];
+ q8 = q2tmp10.val[1];
+ q9 = q2tmp11.val[0];
+ q10 = q2tmp11.val[1];
+
+ vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+ q5, q6, q7, q8, q9, q10,
+ &q4, &q5, &q6, &q7, &q8, &q9);
+
+ q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+ q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+ q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+ q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+ q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+ vreinterpretq_u16_u32(q2tmp2.val[0]));
+ q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+ vreinterpretq_u16_u32(q2tmp3.val[0]));
+ q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+ vreinterpretq_u16_u32(q2tmp2.val[1]));
+ q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+ vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+ q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+ vreinterpretq_u8_u16(q2tmp5.val[0]));
+ q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+ vreinterpretq_u8_u16(q2tmp5.val[1]));
+ q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+ vreinterpretq_u8_u16(q2tmp7.val[0]));
+ q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+ vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+ q3 = q2tmp8.val[0];
+ q4 = q2tmp8.val[1];
+ q5 = q2tmp9.val[0];
+ q6 = q2tmp9.val[1];
+ q7 = q2tmp10.val[0];
+ q8 = q2tmp10.val[1];
+ q9 = q2tmp11.val[0];
+ q10 = q2tmp11.val[1];
+
+ ud = u - 4;
+ vst1_u8(ud, vget_low_u8(q3));
+ ud += pitch;
+ vst1_u8(ud, vget_low_u8(q4));
+ ud += pitch;
+ vst1_u8(ud, vget_low_u8(q5));
+ ud += pitch;
+ vst1_u8(ud, vget_low_u8(q6));
+ ud += pitch;
+ vst1_u8(ud, vget_low_u8(q7));
+ ud += pitch;
+ vst1_u8(ud, vget_low_u8(q8));
+ ud += pitch;
+ vst1_u8(ud, vget_low_u8(q9));
+ ud += pitch;
+ vst1_u8(ud, vget_low_u8(q10));
+
+ vd = v - 4;
+ vst1_u8(vd, vget_high_u8(q3));
+ vd += pitch;
+ vst1_u8(vd, vget_high_u8(q4));
+ vd += pitch;
+ vst1_u8(vd, vget_high_u8(q5));
+ vd += pitch;
+ vst1_u8(vd, vget_high_u8(q6));
+ vd += pitch;
+ vst1_u8(vd, vget_high_u8(q7));
+ vd += pitch;
+ vst1_u8(vd, vget_high_u8(q8));
+ vd += pitch;
+ vst1_u8(vd, vget_high_u8(q9));
+ vd += pitch;
+ vst1_u8(vd, vget_high_u8(q10));
+ return;
+}
diff --git a/media/libvpx/vp8/common/arm/neon/reconintra_neon.c b/media/libvpx/vp8/common/arm/neon/reconintra_neon.c
new file mode 100644
index 000000000..af52cd5ea
--- /dev/null
+++ b/media/libvpx/vp8/common/arm/neon/reconintra_neon.c
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "vp8/common/blockd.h"
+
+void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x,
+ unsigned char * yabove_row,
+ unsigned char * yleft,
+ int left_stride,
+ unsigned char * ypred_ptr,
+ int y_stride) {
+ const int mode = x->mode_info_context->mbmi.mode;
+ int i;
+
+ switch (mode) {
+ case DC_PRED:
+ {
+ int shift = x->up_available + x->left_available;
+ uint8x16_t v_expected_dc = vdupq_n_u8(128);
+
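+ // DC prediction: average the 16 above and/or 16 left samples with
+ // rounding; if neither edge is available the predictor stays at 128.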
+ if (shift) {
+ unsigned int average = 0;
+ int expected_dc;
+ if (x->up_available) {
+ const uint8x16_t v_above = vld1q_u8(yabove_row);
+ const uint16x8_t a = vpaddlq_u8(v_above);
+ const uint32x4_t b = vpaddlq_u16(a);
+ const uint64x2_t c = vpaddlq_u32(b);
+ const uint32x2_t d = vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
+ vreinterpret_u32_u64(vget_high_u64(c)));
+ average = vget_lane_u32(d, 0);
+ }
+ if (x->left_available) {
+ for (i = 0; i < 16; ++i) {
+ average += yleft[0];
+ yleft += left_stride;
+ }
+ }
+ shift += 3;
+ expected_dc = (average + (1 << (shift - 1))) >> shift;
+ v_expected_dc = vmovq_n_u8((uint8_t)expected_dc);
+ }
+ for (i = 0; i < 16; ++i) {
+ vst1q_u8(ypred_ptr, v_expected_dc);
+ ypred_ptr += y_stride;
+ }
+ }
+ break;
+ case V_PRED:
+ {
+ const uint8x16_t v_above = vld1q_u8(yabove_row);
+ for (i = 0; i < 16; ++i) {
+ vst1q_u8(ypred_ptr, v_above);
+ ypred_ptr += y_stride;
+ }
+ }
+ break;
+ case H_PRED:
+ {
+ for (i = 0; i < 16; ++i) {
+ const uint8x16_t v_yleft = vmovq_n_u8((uint8_t)yleft[0]);
+ yleft += left_stride;
+ vst1q_u8(ypred_ptr, v_yleft);
+ ypred_ptr += y_stride;
+ }
+ }
+ break;
+ case TM_PRED:
+ {
+ const uint16x8_t v_ytop_left = vmovq_n_u16((int16_t)yabove_row[-1]);
+ const uint8x16_t v_above = vld1q_u8(yabove_row);
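+ // TrueMotion: pred[r][c] = clamp(above[c] + left[r] - top_left),
+ // widened to 16 bits and saturated back to 8 bits on narrowing.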
+ for (i = 0; i < 16; ++i) {
+ const uint8x8_t v_yleft = vmov_n_u8((int8_t)yleft[0]);
+ const uint16x8_t a_lo = vaddl_u8(vget_low_u8(v_above), v_yleft);
+ const uint16x8_t a_hi = vaddl_u8(vget_high_u8(v_above), v_yleft);
+ const int16x8_t b_lo = vsubq_s16(vreinterpretq_s16_u16(a_lo),
+ vreinterpretq_s16_u16(v_ytop_left));
+ const int16x8_t b_hi = vsubq_s16(vreinterpretq_s16_u16(a_hi),
+ vreinterpretq_s16_u16(v_ytop_left));
+ const uint8x8_t pred_lo = vqmovun_s16(b_lo);
+ const uint8x8_t pred_hi = vqmovun_s16(b_hi);
+
+ vst1q_u8(ypred_ptr, vcombine_u8(pred_lo, pred_hi));
+ ypred_ptr += y_stride;
+ yleft += left_stride;
+ }
+ }
+ break;
+ }
+}
+
+void vp8_build_intra_predictors_mbuv_s_neon(MACROBLOCKD *x,
+ unsigned char * uabove_row,
+ unsigned char * vabove_row,
+ unsigned char * uleft,
+ unsigned char * vleft,
+ int left_stride,
+ unsigned char * upred_ptr,
+ unsigned char * vpred_ptr,
+ int pred_stride) {
+ const int mode = x->mode_info_context->mbmi.uv_mode;
+ int i;
+
+ switch (mode) {
+ case DC_PRED:
+ {
+ int shift = x->up_available + x->left_available;
+ uint8x8_t v_expected_udc = vdup_n_u8(128);
+ uint8x8_t v_expected_vdc = vdup_n_u8(128);
+
+ if (shift) {
+ unsigned int average_u = 0;
+ unsigned int average_v = 0;
+ int expected_udc;
+ int expected_vdc;
+ if (x->up_available) {
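+ // Sum the u and v above rows in one combined vector; after the
+ // pairwise widening adds the u total is in the low 64-bit lane and
+ // the v total in the high lane.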
+ const uint8x8_t v_uabove = vld1_u8(uabove_row);
+ const uint8x8_t v_vabove = vld1_u8(vabove_row);
+ const uint16x8_t a = vpaddlq_u8(vcombine_u8(v_uabove, v_vabove));
+ const uint32x4_t b = vpaddlq_u16(a);
+ const uint64x2_t c = vpaddlq_u32(b);
+ average_u = vgetq_lane_u32(vreinterpretq_u32_u64((c)), 0);
+ average_v = vgetq_lane_u32(vreinterpretq_u32_u64((c)), 2);
+ }
+ if (x->left_available) {
+ for (i = 0; i < 8; ++i) {
+ average_u += uleft[0];
+ uleft += left_stride;
+ average_v += vleft[0];
+ vleft += left_stride;
+ }
+ }
+ shift += 2;
+ expected_udc = (average_u + (1 << (shift - 1))) >> shift;
+ expected_vdc = (average_v + (1 << (shift - 1))) >> shift;
+ v_expected_udc = vmov_n_u8((uint8_t)expected_udc);
+ v_expected_vdc = vmov_n_u8((uint8_t)expected_vdc);
+ }
+ for (i = 0; i < 8; ++i) {
+ vst1_u8(upred_ptr, v_expected_udc);
+ upred_ptr += pred_stride;
+ vst1_u8(vpred_ptr, v_expected_vdc);
+ vpred_ptr += pred_stride;
+ }
+ }
+ break;
+ case V_PRED:
+ {
+ const uint8x8_t v_uabove = vld1_u8(uabove_row);
+ const uint8x8_t v_vabove = vld1_u8(vabove_row);
+ for (i = 0; i < 8; ++i) {
+ vst1_u8(upred_ptr, v_uabove);
+ upred_ptr += pred_stride;
+ vst1_u8(vpred_ptr, v_vabove);
+ vpred_ptr += pred_stride;
+ }
+ }
+ break;
+ case H_PRED:
+ {
+ for (i = 0; i < 8; ++i) {
+ const uint8x8_t v_uleft = vmov_n_u8((uint8_t)uleft[0]);
+ const uint8x8_t v_vleft = vmov_n_u8((uint8_t)vleft[0]);
+ uleft += left_stride;
+ vleft += left_stride;
+ vst1_u8(upred_ptr, v_uleft);
+ upred_ptr += pred_stride;
+ vst1_u8(vpred_ptr, v_vleft);
+ vpred_ptr += pred_stride;
+ }
+ }
+ break;
+ case TM_PRED:
+ {
+ const uint16x8_t v_utop_left = vmovq_n_u16((int16_t)uabove_row[-1]);
+ const uint16x8_t v_vtop_left = vmovq_n_u16((int16_t)vabove_row[-1]);
+ const uint8x8_t v_uabove = vld1_u8(uabove_row);
+ const uint8x8_t v_vabove = vld1_u8(vabove_row);
+ for (i = 0; i < 8; ++i) {
+ const uint8x8_t v_uleft = vmov_n_u8((int8_t)uleft[0]);
+ const uint8x8_t v_vleft = vmov_n_u8((int8_t)vleft[0]);
+ const uint16x8_t a_u = vaddl_u8(v_uabove, v_uleft);
+ const uint16x8_t a_v = vaddl_u8(v_vabove, v_vleft);
+ const int16x8_t b_u = vsubq_s16(vreinterpretq_s16_u16(a_u),
+ vreinterpretq_s16_u16(v_utop_left));
+ const int16x8_t b_v = vsubq_s16(vreinterpretq_s16_u16(a_v),
+ vreinterpretq_s16_u16(v_vtop_left));
+ const uint8x8_t pred_u = vqmovun_s16(b_u);
+ const uint8x8_t pred_v = vqmovun_s16(b_v);
+
+ vst1_u8(upred_ptr, pred_u);
+ vst1_u8(vpred_ptr, pred_v);
+ upred_ptr += pred_stride;
+ vpred_ptr += pred_stride;
+ uleft += left_stride;
+ vleft += left_stride;
+ }
+ }
+ break;
+ }
+}
diff --git a/media/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c b/media/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c
new file mode 100644
index 000000000..373afa6ed
--- /dev/null
+++ b/media/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+static const int16_t cospi8sqrt2minus1 = 20091;
+static const int16_t sinpi8sqrt2 = 35468;
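+// 20091/65536 ~= cos(pi/8)*sqrt(2) - 1 and 35468/65536 ~= sin(pi/8)*sqrt(2),
+// the Q16 rotation constants of the VP8 4x4 inverse transform. 35468
+// intentionally wraps to a negative int16_t; the later vqaddq_s16 of the
+// unscaled input compensates.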
+
+void vp8_short_idct4x4llm_neon(
+ int16_t *input,
+ unsigned char *pred_ptr,
+ int pred_stride,
+ unsigned char *dst_ptr,
+ int dst_stride) {
+ int i;
+ uint32x2_t d6u32 = vdup_n_u32(0);
+ uint8x8_t d1u8;
+ int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;
+ uint16x8_t q1u16;
+ int16x8_t q1s16, q2s16, q3s16, q4s16;
+ int32x2x2_t v2tmp0, v2tmp1;
+ int16x4x2_t v2tmp2, v2tmp3;
+
+ d2 = vld1_s16(input);
+ d3 = vld1_s16(input + 4);
+ d4 = vld1_s16(input + 8);
+ d5 = vld1_s16(input + 12);
+
+ // 1st for loop
+ q1s16 = vcombine_s16(d2, d4); // Swap d3 d4 here
+ q2s16 = vcombine_s16(d3, d5);
+
+ q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
+ q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);
+
+ d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1
+ d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1
+
+ q3s16 = vshrq_n_s16(q3s16, 1);
+ q4s16 = vshrq_n_s16(q4s16, 1);
+
+ q3s16 = vqaddq_s16(q3s16, q2s16);
+ q4s16 = vqaddq_s16(q4s16, q2s16);
+
+ d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1
+ d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16)); // d1
+
+ d2 = vqadd_s16(d12, d11);
+ d3 = vqadd_s16(d13, d10);
+ d4 = vqsub_s16(d13, d10);
+ d5 = vqsub_s16(d12, d11);
+
+ v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
+ v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
+ v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
+ vreinterpret_s16_s32(v2tmp1.val[0]));
+ v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
+ vreinterpret_s16_s32(v2tmp1.val[1]));
+
+ // 2nd for loop
+ q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp3.val[0]);
+ q2s16 = vcombine_s16(v2tmp2.val[1], v2tmp3.val[1]);
+
+ q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
+ q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);
+
+ d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1
+ d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1
+
+ q3s16 = vshrq_n_s16(q3s16, 1);
+ q4s16 = vshrq_n_s16(q4s16, 1);
+
+ q3s16 = vqaddq_s16(q3s16, q2s16);
+ q4s16 = vqaddq_s16(q4s16, q2s16);
+
+ d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1
+ d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16)); // d1
+
+ d2 = vqadd_s16(d12, d11);
+ d3 = vqadd_s16(d13, d10);
+ d4 = vqsub_s16(d13, d10);
+ d5 = vqsub_s16(d12, d11);
+
+ d2 = vrshr_n_s16(d2, 3);
+ d3 = vrshr_n_s16(d3, 3);
+ d4 = vrshr_n_s16(d4, 3);
+ d5 = vrshr_n_s16(d5, 3);
+
+ v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
+ v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
+ v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
+ vreinterpret_s16_s32(v2tmp1.val[0]));
+ v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
+ vreinterpret_s16_s32(v2tmp1.val[1]));
+
+ q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp2.val[1]);
+ q2s16 = vcombine_s16(v2tmp3.val[0], v2tmp3.val[1]);
+
+ // dc_only_idct_add
+ for (i = 0; i < 2; i++, q1s16 = q2s16) {
+ d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 0);
+ pred_ptr += pred_stride;
+ d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 1);
+ pred_ptr += pred_stride;
+
+ q1u16 = vaddw_u8(vreinterpretq_u16_s16(q1s16),
+ vreinterpret_u8_u32(d6u32));
+ d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
+
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 0);
+ dst_ptr += dst_stride;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 1);
+ dst_ptr += dst_stride;
+ }
+ return;
+}
diff --git a/media/libvpx/vp8/common/arm/neon/sixtappredict_neon.c b/media/libvpx/vp8/common/arm/neon/sixtappredict_neon.c
new file mode 100644
index 000000000..4c2efc92b
--- /dev/null
+++ b/media/libvpx/vp8/common/arm/neon/sixtappredict_neon.c
@@ -0,0 +1,1754 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "vpx_ports/mem.h"
+
+static const int8_t vp8_sub_pel_filters[8][8] = {
+ {0, 0, 128, 0, 0, 0, 0, 0}, /* note that 1/8 pel positions are */
+ {0, -6, 123, 12, -1, 0, 0, 0}, /* just as per alpha -0.5 bicubic */
+ {2, -11, 108, 36, -8, 1, 0, 0}, /* New 1/4 pel 6 tap filter */
+ {0, -9, 93, 50, -6, 0, 0, 0},
+ {3, -16, 77, 77, -16, 3, 0, 0}, /* New 1/2 pel 6 tap filter */
+ {0, -6, 50, 93, -9, 0, 0, 0},
+ {1, -8, 36, 108, -11, 2, 0, 0}, /* New 1/4 pel 6 tap filter */
+ {0, -1, 12, 123, -6, 0, 0, 0},
+};
+
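+// Six-tap sub-pel prediction runs in two passes: a horizontal 6-tap filter
+// over the block height plus 5 extra source rows, then a vertical 6-tap
+// filter. When xoffset or yoffset is zero the corresponding pass is skipped.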
+void vp8_sixtap_predict4x4_neon(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ unsigned char *src;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d18u8, d19u8, d20u8, d21u8;
+ uint8x8_t d23u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
+ int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
+ uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
+ uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
+ int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
+ uint8x16_t q3u8, q4u8, q5u8, q6u8, q11u8;
+ uint64x2_t q3u64, q4u64, q5u64, q6u64, q9u64, q10u64;
+ uint32x2x2_t d0u32x2, d1u32x2;
+
+ if (xoffset == 0) { // secondpass_filter4x4_only
+ uint32x2_t d27u32 = vdup_n_u32(0);
+ uint32x2_t d28u32 = vdup_n_u32(0);
+ uint32x2_t d29u32 = vdup_n_u32(0);
+ uint32x2_t d30u32 = vdup_n_u32(0);
+ uint32x2_t d31u32 = vdup_n_u32(0);
+
+ // load second_pass filter
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+ // load src data
+ src = src_ptr - src_pixels_per_line * 2;
+ d27u32 = vld1_lane_u32((const uint32_t *)src, d27u32, 0);
+ src += src_pixels_per_line;
+ d27u32 = vld1_lane_u32((const uint32_t *)src, d27u32, 1);
+ src += src_pixels_per_line;
+ d28u32 = vld1_lane_u32((const uint32_t *)src, d28u32, 0);
+ src += src_pixels_per_line;
+ d28u32 = vld1_lane_u32((const uint32_t *)src, d28u32, 1);
+ src += src_pixels_per_line;
+ d29u32 = vld1_lane_u32((const uint32_t *)src, d29u32, 0);
+ src += src_pixels_per_line;
+ d29u32 = vld1_lane_u32((const uint32_t *)src, d29u32, 1);
+ src += src_pixels_per_line;
+ d30u32 = vld1_lane_u32((const uint32_t *)src, d30u32, 0);
+ src += src_pixels_per_line;
+ d30u32 = vld1_lane_u32((const uint32_t *)src, d30u32, 1);
+ src += src_pixels_per_line;
+ d31u32 = vld1_lane_u32((const uint32_t *)src, d31u32, 0);
+
+ d27u8 = vreinterpret_u8_u32(d27u32);
+ d28u8 = vreinterpret_u8_u32(d28u32);
+ d29u8 = vreinterpret_u8_u32(d29u32);
+ d30u8 = vreinterpret_u8_u32(d30u32);
+ d31u8 = vreinterpret_u8_u32(d31u32);
+
+ d23u8 = vext_u8(d27u8, d28u8, 4);
+ d24u8 = vext_u8(d28u8, d29u8, 4);
+ d25u8 = vext_u8(d29u8, d30u8, 4);
+ d26u8 = vext_u8(d30u8, d31u8, 4);
+
+ q3u16 = vmull_u8(d27u8, d0u8);
+ q4u16 = vmull_u8(d28u8, d0u8);
+ q5u16 = vmull_u8(d25u8, d5u8);
+ q6u16 = vmull_u8(d26u8, d5u8);
+
+ q3u16 = vmlsl_u8(q3u16, d29u8, d4u8);
+ q4u16 = vmlsl_u8(q4u16, d30u8, d4u8);
+ q5u16 = vmlsl_u8(q5u16, d23u8, d1u8);
+ q6u16 = vmlsl_u8(q6u16, d24u8, d1u8);
+
+ q3u16 = vmlal_u8(q3u16, d28u8, d2u8);
+ q4u16 = vmlal_u8(q4u16, d29u8, d2u8);
+ q5u16 = vmlal_u8(q5u16, d24u8, d3u8);
+ q6u16 = vmlal_u8(q6u16, d25u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+
+ q5s16 = vqaddq_s16(q5s16, q3s16);
+ q6s16 = vqaddq_s16(q6s16, q4s16);
+
+ d3u8 = vqrshrun_n_s16(q5s16, 7);
+ d4u8 = vqrshrun_n_s16(q6s16, 7);
+
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0);
+ dst_ptr += dst_pitch;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1);
+ dst_ptr += dst_pitch;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 0);
+ dst_ptr += dst_pitch;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 1);
+ return;
+ }
+
+ // load first_pass filter
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+ // First pass: output_height lines x output_width columns (9x4)
+
+ if (yoffset == 0) // firstpass_filter4x4_only
+ src = src_ptr - 2;
+ else
+ src = src_ptr - 2 - (src_pixels_per_line * 2);
+
+ q3u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q4u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q5u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q6u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+
+ d18u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
+ d19u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
+ d20u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
+ d21u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
+
+ // vswp here
+ q3u8 = vcombine_u8(vget_low_u8(q3u8), vget_low_u8(q4u8));
+ q5u8 = vcombine_u8(vget_low_u8(q5u8), vget_low_u8(q6u8));
+
+ d0u32x2 = vzip_u32(vreinterpret_u32_u8(d18u8), // d18 d19
+ vreinterpret_u32_u8(d19u8));
+ d1u32x2 = vzip_u32(vreinterpret_u32_u8(d20u8), // d20 d21
+ vreinterpret_u32_u8(d21u8));
+ q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d5u8);
+ q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d5u8);
+
+ // keep original src data in q4 q6
+ q4u64 = vreinterpretq_u64_u8(q3u8);
+ q6u64 = vreinterpretq_u64_u8(q5u8);
+
+ d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q3u8)), // d6 d7
+ vreinterpret_u32_u8(vget_high_u8(q3u8)));
+ d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q5u8)), // d10 d11
+ vreinterpret_u32_u8(vget_high_u8(q5u8)));
+ q9u64 = vshrq_n_u64(q4u64, 8);
+ q10u64 = vshrq_n_u64(q6u64, 8);
+ q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d0u8);
+ q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d0u8);
+
+ d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)), // d18 d19
+ vreinterpret_u32_u64(vget_high_u64(q9u64)));
+ d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)), // d20 d21
+ vreinterpret_u32_u64(vget_high_u64(q10u64)));
+ q3u64 = vshrq_n_u64(q4u64, 32);
+ q5u64 = vshrq_n_u64(q6u64, 32);
+ q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d1u8);
+ q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d1u8);
+
+ d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)), // d6 d7
+ vreinterpret_u32_u64(vget_high_u64(q3u64)));
+ d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), // d10 d11
+ vreinterpret_u32_u64(vget_high_u64(q5u64)));
+ q9u64 = vshrq_n_u64(q4u64, 16);
+ q10u64 = vshrq_n_u64(q6u64, 16);
+ q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d4u8);
+ q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d4u8);
+
+ d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)), // d18 d19
+ vreinterpret_u32_u64(vget_high_u64(q9u64)));
+ d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)), // d20 d21
+ vreinterpret_u32_u64(vget_high_u64(q10u64)));
+ q3u64 = vshrq_n_u64(q4u64, 24);
+ q5u64 = vshrq_n_u64(q6u64, 24);
+ q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d2u8);
+ q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d2u8);
+
+ d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)), // d6 d7
+ vreinterpret_u32_u64(vget_high_u64(q3u64)));
+ d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), // d10 d11
+ vreinterpret_u32_u64(vget_high_u64(q5u64)));
+ q9u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d3u8);
+ q10u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d3u8);
+
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+ q7s16 = vqaddq_s16(q7s16, q9s16);
+ q8s16 = vqaddq_s16(q8s16, q10s16);
+
+ d27u8 = vqrshrun_n_s16(q7s16, 7);
+ d28u8 = vqrshrun_n_s16(q8s16, 7);
+
+ if (yoffset == 0) { // firstpass_filter4x4_only
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d27u8), 0);
+ dst_ptr += dst_pitch;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d27u8), 1);
+ dst_ptr += dst_pitch;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 0);
+ dst_ptr += dst_pitch;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 1);
+ return;
+ }
+
+ // First pass on the remaining 5 lines of data
+ q3u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q4u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q5u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q6u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q11u8 = vld1q_u8(src);
+
+ d18u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
+ d19u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
+ d20u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
+ d21u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
+
+ // take the low halves of q3/q4 and q5/q6 (vswp in the original assembly)
+ q3u8 = vcombine_u8(vget_low_u8(q3u8), vget_low_u8(q4u8));
+ q5u8 = vcombine_u8(vget_low_u8(q5u8), vget_low_u8(q6u8));
+
+ d0u32x2 = vzip_u32(vreinterpret_u32_u8(d18u8), // d18 d19
+ vreinterpret_u32_u8(d19u8));
+ d1u32x2 = vzip_u32(vreinterpret_u32_u8(d20u8), // d20 d21
+ vreinterpret_u32_u8(d21u8));
+ d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 5);
+ q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d5u8);
+ q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d5u8);
+ q12u16 = vmull_u8(d31u8, d5u8);
+
+ q4u64 = vreinterpretq_u64_u8(q3u8);
+ q6u64 = vreinterpretq_u64_u8(q5u8);
+
+ d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q3u8)), // d6 d7
+ vreinterpret_u32_u8(vget_high_u8(q3u8)));
+ d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q5u8)), // d10 d11
+ vreinterpret_u32_u8(vget_high_u8(q5u8)));
+ q9u64 = vshrq_n_u64(q4u64, 8);
+ q10u64 = vshrq_n_u64(q6u64, 8);
+ q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d0u8);
+ q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d0u8);
+ q12u16 = vmlal_u8(q12u16, vget_low_u8(q11u8), d0u8);
+
+ d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)), // d18 d19
+ vreinterpret_u32_u64(vget_high_u64(q9u64)));
+ d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)), // d20 d21
+ vreinterpret_u32_u64(vget_high_u64(q10u64)));
+ q3u64 = vshrq_n_u64(q4u64, 32);
+ q5u64 = vshrq_n_u64(q6u64, 32);
+ d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 1);
+ q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d1u8);
+ q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d1u8);
+ q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
+
+ d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)), // d6 d7
+ vreinterpret_u32_u64(vget_high_u64(q3u64)));
+ d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), // d10 d11
+ vreinterpret_u32_u64(vget_high_u64(q5u64)));
+ q9u64 = vshrq_n_u64(q4u64, 16);
+ q10u64 = vshrq_n_u64(q6u64, 16);
+ d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 4);
+ q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d4u8);
+ q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d4u8);
+ q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
+
+ d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)), // d18 d19
+ vreinterpret_u32_u64(vget_high_u64(q9u64)));
+ d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)), // d20 d21
+ vreinterpret_u32_u64(vget_high_u64(q10u64)));
+ q3u64 = vshrq_n_u64(q4u64, 24);
+ q5u64 = vshrq_n_u64(q6u64, 24);
+ d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 2);
+ q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d2u8);
+ q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d2u8);
+ q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
+
+ d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)), // d6 d7
+ vreinterpret_u32_u64(vget_high_u64(q3u64)));
+ d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), // d10 d11
+ vreinterpret_u32_u64(vget_high_u64(q5u64)));
+ d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 3);
+ q9u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d3u8);
+ q10u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d3u8);
+ q11u16 = vmull_u8(d31u8, d3u8);
+
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+ q11s16 = vreinterpretq_s16_u16(q11u16);
+ q12s16 = vreinterpretq_s16_u16(q12u16);
+ q7s16 = vqaddq_s16(q7s16, q9s16);
+ q8s16 = vqaddq_s16(q8s16, q10s16);
+ q12s16 = vqaddq_s16(q12s16, q11s16);
+
+ d29u8 = vqrshrun_n_s16(q7s16, 7);
+ d30u8 = vqrshrun_n_s16(q8s16, 7);
+ d31u8 = vqrshrun_n_s16(q12s16, 7);
+
+ // Second pass: 4x4
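+ // The six-tap coefficients are loaded as signed bytes and their absolute
+ // values feed the unsigned multiplies; the tap 1 and tap 4 products are
+ // subtracted (vmlsl) below while the remaining taps are added.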
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
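+ // d27..d30 each pack two 4-pixel filtered rows (rows 0..7) and d31 holds
+ // row 8; vext by 4 bytes builds the same pairs shifted down one row for
+ // the vertical filter.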
+ d23u8 = vext_u8(d27u8, d28u8, 4);
+ d24u8 = vext_u8(d28u8, d29u8, 4);
+ d25u8 = vext_u8(d29u8, d30u8, 4);
+ d26u8 = vext_u8(d30u8, d31u8, 4);
+
+ q3u16 = vmull_u8(d27u8, d0u8);
+ q4u16 = vmull_u8(d28u8, d0u8);
+ q5u16 = vmull_u8(d25u8, d5u8);
+ q6u16 = vmull_u8(d26u8, d5u8);
+
+ q3u16 = vmlsl_u8(q3u16, d29u8, d4u8);
+ q4u16 = vmlsl_u8(q4u16, d30u8, d4u8);
+ q5u16 = vmlsl_u8(q5u16, d23u8, d1u8);
+ q6u16 = vmlsl_u8(q6u16, d24u8, d1u8);
+
+ q3u16 = vmlal_u8(q3u16, d28u8, d2u8);
+ q4u16 = vmlal_u8(q4u16, d29u8, d2u8);
+ q5u16 = vmlal_u8(q5u16, d24u8, d3u8);
+ q6u16 = vmlal_u8(q6u16, d25u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+
+ q5s16 = vqaddq_s16(q5s16, q3s16);
+ q6s16 = vqaddq_s16(q6s16, q4s16);
+
+ d3u8 = vqrshrun_n_s16(q5s16, 7);
+ d4u8 = vqrshrun_n_s16(q6s16, 7);
+
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0);
+ dst_ptr += dst_pitch;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1);
+ dst_ptr += dst_pitch;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 0);
+ dst_ptr += dst_pitch;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 1);
+ return;
+}
+
+void vp8_sixtap_predict8x4_neon(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ unsigned char *src;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
+ uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8;
+ uint8x8_t d27u8, d28u8, d29u8, d30u8, d31u8;
+ int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
+ uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
+ uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
+ int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
+ uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8;
+
+ if (xoffset == 0) { // secondpass_filter8x4_only
+ // load second_pass filter
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+ // load src data
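+ // Nine source rows are needed for the four output rows of the 6-tap
+ // vertical filter: two above the block, the four block rows, three below.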
+ src = src_ptr - src_pixels_per_line * 2;
+ d22u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d23u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d24u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d25u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d26u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d27u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d28u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d29u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d30u8 = vld1_u8(src);
+
+ q3u16 = vmull_u8(d22u8, d0u8);
+ q4u16 = vmull_u8(d23u8, d0u8);
+ q5u16 = vmull_u8(d24u8, d0u8);
+ q6u16 = vmull_u8(d25u8, d0u8);
+
+ q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
+ q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
+ q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
+ q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
+
+ q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
+ q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
+ q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
+ q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
+
+ q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
+ q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
+ q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
+ q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
+
+ q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
+ q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
+ q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
+ q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
+
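+ // The center-tap (tap 3) products are kept in separate accumulators and
+ // folded in with a saturating 16-bit add before the rounding narrow.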
+ q7u16 = vmull_u8(d25u8, d3u8);
+ q8u16 = vmull_u8(d26u8, d3u8);
+ q9u16 = vmull_u8(d27u8, d3u8);
+ q10u16 = vmull_u8(d28u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+
+ q7s16 = vqaddq_s16(q7s16, q3s16);
+ q8s16 = vqaddq_s16(q8s16, q4s16);
+ q9s16 = vqaddq_s16(q9s16, q5s16);
+ q10s16 = vqaddq_s16(q10s16, q6s16);
+
+ d6u8 = vqrshrun_n_s16(q7s16, 7);
+ d7u8 = vqrshrun_n_s16(q8s16, 7);
+ d8u8 = vqrshrun_n_s16(q9s16, 7);
+ d9u8 = vqrshrun_n_s16(q10s16, 7);
+
+ vst1_u8(dst_ptr, d6u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d7u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d8u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d9u8);
+ return;
+ }
+
+ // load first_pass filter
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+ // First pass: output_height lines x output_width columns (9x8)
+ if (yoffset == 0) // firstpass_filter8x4_only
+ src = src_ptr - 2;
+ else
+ src = src_ptr - 2 - (src_pixels_per_line * 2);
+ q3u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q4u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q5u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q6u8 = vld1q_u8(src);
+
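+ // Horizontal 6-tap filter: vext shifts each 16-byte source row by 1..5
+ // bytes so every tap is multiply-accumulated across all 8 output pixels
+ // at once.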
+ q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+ q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+ q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
+ q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
+
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
+
+ q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
+ q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
+ q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
+ q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);
+
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
+
+ q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
+ q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
+ q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
+ q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);
+
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
+
+ q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
+ q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
+ q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
+ q10u16 = vmlal_u8(q10u16, d31u8, d2u8);
+
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
+
+ q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
+ q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
+ q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
+ q10u16 = vmlal_u8(q10u16, d31u8, d5u8);
+
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
+
+ q3u16 = vmull_u8(d28u8, d3u8);
+ q4u16 = vmull_u8(d29u8, d3u8);
+ q5u16 = vmull_u8(d30u8, d3u8);
+ q6u16 = vmull_u8(d31u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+
+ q7s16 = vqaddq_s16(q7s16, q3s16);
+ q8s16 = vqaddq_s16(q8s16, q4s16);
+ q9s16 = vqaddq_s16(q9s16, q5s16);
+ q10s16 = vqaddq_s16(q10s16, q6s16);
+
+ d22u8 = vqrshrun_n_s16(q7s16, 7);
+ d23u8 = vqrshrun_n_s16(q8s16, 7);
+ d24u8 = vqrshrun_n_s16(q9s16, 7);
+ d25u8 = vqrshrun_n_s16(q10s16, 7);
+
+ if (yoffset == 0) { // firstpass_filter8x4_only
+ vst1_u8(dst_ptr, d22u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d23u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d24u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d25u8);
+ return;
+ }
+
+ // First pass on the remaining 5 lines of data
+ src += src_pixels_per_line;
+ q3u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q4u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q5u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q6u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q7u8 = vld1q_u8(src);
+
+ q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+ q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+ q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
+ q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
+ q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);
+
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
+
+ q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
+ q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
+ q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
+ q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
+ q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
+
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
+
+ q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
+ q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
+ q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
+ q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
+ q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
+
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
+
+ q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
+ q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
+ q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
+ q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
+ q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
+
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
+
+ q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
+ q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
+ q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
+ q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
+ q12u16 = vmlal_u8(q12u16, d31u8, d5u8);
+
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
+
+ q3u16 = vmull_u8(d27u8, d3u8);
+ q4u16 = vmull_u8(d28u8, d3u8);
+ q5u16 = vmull_u8(d29u8, d3u8);
+ q6u16 = vmull_u8(d30u8, d3u8);
+ q7u16 = vmull_u8(d31u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+ q11s16 = vreinterpretq_s16_u16(q11u16);
+ q12s16 = vreinterpretq_s16_u16(q12u16);
+
+ q8s16 = vqaddq_s16(q8s16, q3s16);
+ q9s16 = vqaddq_s16(q9s16, q4s16);
+ q10s16 = vqaddq_s16(q10s16, q5s16);
+ q11s16 = vqaddq_s16(q11s16, q6s16);
+ q12s16 = vqaddq_s16(q12s16, q7s16);
+
+ d26u8 = vqrshrun_n_s16(q8s16, 7);
+ d27u8 = vqrshrun_n_s16(q9s16, 7);
+ d28u8 = vqrshrun_n_s16(q10s16, 7);
+ d29u8 = vqrshrun_n_s16(q11s16, 7);
+ d30u8 = vqrshrun_n_s16(q12s16, 7);
+
+ // Second pass: 8x4
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+ q3u16 = vmull_u8(d22u8, d0u8);
+ q4u16 = vmull_u8(d23u8, d0u8);
+ q5u16 = vmull_u8(d24u8, d0u8);
+ q6u16 = vmull_u8(d25u8, d0u8);
+
+ q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
+ q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
+ q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
+ q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
+
+ q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
+ q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
+ q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
+ q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
+
+ q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
+ q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
+ q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
+ q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
+
+ q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
+ q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
+ q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
+ q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
+
+ q7u16 = vmull_u8(d25u8, d3u8);
+ q8u16 = vmull_u8(d26u8, d3u8);
+ q9u16 = vmull_u8(d27u8, d3u8);
+ q10u16 = vmull_u8(d28u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+
+ q7s16 = vqaddq_s16(q7s16, q3s16);
+ q8s16 = vqaddq_s16(q8s16, q4s16);
+ q9s16 = vqaddq_s16(q9s16, q5s16);
+ q10s16 = vqaddq_s16(q10s16, q6s16);
+
+ d6u8 = vqrshrun_n_s16(q7s16, 7);
+ d7u8 = vqrshrun_n_s16(q8s16, 7);
+ d8u8 = vqrshrun_n_s16(q9s16, 7);
+ d9u8 = vqrshrun_n_s16(q10s16, 7);
+
+ vst1_u8(dst_ptr, d6u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d7u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d8u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d9u8);
+ return;
+}
+
+void vp8_sixtap_predict8x8_neon(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ unsigned char *src, *tmpp;
+ unsigned char tmp[64];
+ int i;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
+ uint8x8_t d18u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8, d25u8;
+ uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
+ int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
+ uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
+ uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
+ int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
+ uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q9u8, q10u8, q11u8, q12u8;
+
+ if (xoffset == 0) { // secondpass_filter8x8_only
+ // load second_pass filter
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+ // load src data
+ src = src_ptr - src_pixels_per_line * 2;
+ d18u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d19u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d20u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d21u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d22u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d23u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d24u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d25u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d26u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d27u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d28u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d29u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d30u8 = vld1_u8(src);
+
+ for (i = 2; i > 0; i--) {
+ q3u16 = vmull_u8(d18u8, d0u8);
+ q4u16 = vmull_u8(d19u8, d0u8);
+ q5u16 = vmull_u8(d20u8, d0u8);
+ q6u16 = vmull_u8(d21u8, d0u8);
+
+ q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
+ q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
+ q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
+ q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
+
+ q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
+ q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
+ q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
+ q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
+
+ q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
+ q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
+ q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
+ q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
+
+ q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
+ q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
+ q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
+ q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
+
+ q7u16 = vmull_u8(d21u8, d3u8);
+ q8u16 = vmull_u8(d22u8, d3u8);
+ q9u16 = vmull_u8(d23u8, d3u8);
+ q10u16 = vmull_u8(d24u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+
+ q7s16 = vqaddq_s16(q7s16, q3s16);
+ q8s16 = vqaddq_s16(q8s16, q4s16);
+ q9s16 = vqaddq_s16(q9s16, q5s16);
+ q10s16 = vqaddq_s16(q10s16, q6s16);
+
+ d6u8 = vqrshrun_n_s16(q7s16, 7);
+ d7u8 = vqrshrun_n_s16(q8s16, 7);
+ d8u8 = vqrshrun_n_s16(q9s16, 7);
+ d9u8 = vqrshrun_n_s16(q10s16, 7);
+
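+ // Slide the nine-row window down by four rows for the next four
+ // output rows.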
+ d18u8 = d22u8;
+ d19u8 = d23u8;
+ d20u8 = d24u8;
+ d21u8 = d25u8;
+ d22u8 = d26u8;
+ d23u8 = d27u8;
+ d24u8 = d28u8;
+ d25u8 = d29u8;
+ d26u8 = d30u8;
+
+ vst1_u8(dst_ptr, d6u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d7u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d8u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d9u8);
+ dst_ptr += dst_pitch;
+ }
+ return;
+ }
+
+ // load first_pass filter
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+ // First pass: output_height lines x output_width columns (13x8)
+ if (yoffset == 0) // firstpass_filter8x8_only
+ src = src_ptr - 2;
+ else
+ src = src_ptr - 2 - (src_pixels_per_line * 2);
+
+ tmpp = tmp;
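+ // With both offsets set, the eight horizontally filtered rows are staged
+ // in tmp (8 rows x 8 bytes = 64) for the vertical pass; with yoffset == 0
+ // they are written straight to dst_ptr.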
+ for (i = 2; i > 0; i--) {
+ q3u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q4u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q5u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q6u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+
+ __builtin_prefetch(src);
+ __builtin_prefetch(src + src_pixels_per_line);
+ __builtin_prefetch(src + src_pixels_per_line * 2);
+
+ q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+ q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+ q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
+ q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
+
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
+
+ q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
+ q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
+ q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
+ q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);
+
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
+
+ q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
+ q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
+ q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
+ q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);
+
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
+
+ q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
+ q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
+ q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
+ q10u16 = vmlal_u8(q10u16, d31u8, d2u8);
+
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
+
+ q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
+ q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
+ q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
+ q10u16 = vmlal_u8(q10u16, d31u8, d5u8);
+
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
+
+ q3u16 = vmull_u8(d28u8, d3u8);
+ q4u16 = vmull_u8(d29u8, d3u8);
+ q5u16 = vmull_u8(d30u8, d3u8);
+ q6u16 = vmull_u8(d31u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+
+ q7s16 = vqaddq_s16(q7s16, q3s16);
+ q8s16 = vqaddq_s16(q8s16, q4s16);
+ q9s16 = vqaddq_s16(q9s16, q5s16);
+ q10s16 = vqaddq_s16(q10s16, q6s16);
+
+ d22u8 = vqrshrun_n_s16(q7s16, 7);
+ d23u8 = vqrshrun_n_s16(q8s16, 7);
+ d24u8 = vqrshrun_n_s16(q9s16, 7);
+ d25u8 = vqrshrun_n_s16(q10s16, 7);
+
+ if (yoffset == 0) { // firstpass_filter8x8_only
+ vst1_u8(dst_ptr, d22u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d23u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d24u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d25u8);
+ dst_ptr += dst_pitch;
+ } else {
+ vst1_u8(tmpp, d22u8);
+ tmpp += 8;
+ vst1_u8(tmpp, d23u8);
+ tmpp += 8;
+ vst1_u8(tmpp, d24u8);
+ tmpp += 8;
+ vst1_u8(tmpp, d25u8);
+ tmpp += 8;
+ }
+ }
+ if (yoffset == 0)
+ return;
+
+ // First pass on the remaining 5 lines of data
+ q3u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q4u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q5u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q6u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q7u8 = vld1q_u8(src);
+
+ q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+ q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+ q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
+ q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
+ q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);
+
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
+
+ q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
+ q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
+ q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
+ q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
+ q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
+
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
+
+ q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
+ q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
+ q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
+ q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
+ q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
+
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
+
+ q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
+ q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
+ q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
+ q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
+ q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
+
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
+
+ q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
+ q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
+ q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
+ q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
+ q12u16 = vmlal_u8(q12u16, d31u8, d5u8);
+
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
+
+ q3u16 = vmull_u8(d27u8, d3u8);
+ q4u16 = vmull_u8(d28u8, d3u8);
+ q5u16 = vmull_u8(d29u8, d3u8);
+ q6u16 = vmull_u8(d30u8, d3u8);
+ q7u16 = vmull_u8(d31u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+ q11s16 = vreinterpretq_s16_u16(q11u16);
+ q12s16 = vreinterpretq_s16_u16(q12u16);
+
+ q8s16 = vqaddq_s16(q8s16, q3s16);
+ q9s16 = vqaddq_s16(q9s16, q4s16);
+ q10s16 = vqaddq_s16(q10s16, q5s16);
+ q11s16 = vqaddq_s16(q11s16, q6s16);
+ q12s16 = vqaddq_s16(q12s16, q7s16);
+
+ d26u8 = vqrshrun_n_s16(q8s16, 7);
+ d27u8 = vqrshrun_n_s16(q9s16, 7);
+ d28u8 = vqrshrun_n_s16(q10s16, 7);
+ d29u8 = vqrshrun_n_s16(q11s16, 7);
+ d30u8 = vqrshrun_n_s16(q12s16, 7);
+
+ // Second pass: 8x8
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+ tmpp = tmp;
+ q9u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+ q10u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+ q11u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+ q12u8 = vld1q_u8(tmpp);
+
+ d18u8 = vget_low_u8(q9u8);
+ d19u8 = vget_high_u8(q9u8);
+ d20u8 = vget_low_u8(q10u8);
+ d21u8 = vget_high_u8(q10u8);
+ d22u8 = vget_low_u8(q11u8);
+ d23u8 = vget_high_u8(q11u8);
+ d24u8 = vget_low_u8(q12u8);
+ d25u8 = vget_high_u8(q12u8);
+
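+ // d18..d25 now hold rows 0..7 from tmp; d26..d30 (from the pass above)
+ // hold rows 8..12, giving the 13 rows needed for 8 output rows.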
+ for (i = 2; i > 0; i--) {
+ q3u16 = vmull_u8(d18u8, d0u8);
+ q4u16 = vmull_u8(d19u8, d0u8);
+ q5u16 = vmull_u8(d20u8, d0u8);
+ q6u16 = vmull_u8(d21u8, d0u8);
+
+ q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
+ q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
+ q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
+ q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
+
+ q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
+ q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
+ q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
+ q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
+
+ q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
+ q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
+ q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
+ q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
+
+ q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
+ q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
+ q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
+ q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
+
+ q7u16 = vmull_u8(d21u8, d3u8);
+ q8u16 = vmull_u8(d22u8, d3u8);
+ q9u16 = vmull_u8(d23u8, d3u8);
+ q10u16 = vmull_u8(d24u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+
+ q7s16 = vqaddq_s16(q7s16, q3s16);
+ q8s16 = vqaddq_s16(q8s16, q4s16);
+ q9s16 = vqaddq_s16(q9s16, q5s16);
+ q10s16 = vqaddq_s16(q10s16, q6s16);
+
+ d6u8 = vqrshrun_n_s16(q7s16, 7);
+ d7u8 = vqrshrun_n_s16(q8s16, 7);
+ d8u8 = vqrshrun_n_s16(q9s16, 7);
+ d9u8 = vqrshrun_n_s16(q10s16, 7);
+
+ d18u8 = d22u8;
+ d19u8 = d23u8;
+ d20u8 = d24u8;
+ d21u8 = d25u8;
+ d22u8 = d26u8;
+ d23u8 = d27u8;
+ d24u8 = d28u8;
+ d25u8 = d29u8;
+ d26u8 = d30u8;
+
+ vst1_u8(dst_ptr, d6u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d7u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d8u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d9u8);
+ dst_ptr += dst_pitch;
+ }
+ return;
+}
+
+void vp8_sixtap_predict16x16_neon(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ unsigned char *src, *src_tmp, *dst, *tmpp;
+ unsigned char tmp[336];
+ int i, j;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
+ uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d18u8, d19u8;
+ uint8x8_t d20u8, d21u8, d22u8, d23u8, d24u8, d25u8, d26u8, d27u8;
+ uint8x8_t d28u8, d29u8, d30u8, d31u8;
+ int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
+ uint8x16_t q3u8, q4u8;
+ uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16, q8u16, q9u16, q10u16;
+ uint16x8_t q11u16, q12u16, q13u16, q15u16;
+ int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16, q8s16, q9s16, q10s16;
+ int16x8_t q11s16, q12s16, q13s16, q15s16;
+
+ if (xoffset == 0) { // secondpass_filter16x16_only
+ // load second_pass filter
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+ // load src data
+ src_tmp = src_ptr - src_pixels_per_line * 2;
+ for (i = 0; i < 2; i++) {
+ src = src_tmp + i * 8;
+ dst = dst_ptr + i * 8;
+ d18u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d19u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d20u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d21u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d22u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ for (j = 0; j < 4; j++) {
+ d23u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d24u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d25u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d26u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+
+ q3u16 = vmull_u8(d18u8, d0u8);
+ q4u16 = vmull_u8(d19u8, d0u8);
+ q5u16 = vmull_u8(d20u8, d0u8);
+ q6u16 = vmull_u8(d21u8, d0u8);
+
+ q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
+ q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
+ q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
+ q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
+
+ q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
+ q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
+ q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
+ q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
+
+ q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
+ q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
+ q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
+ q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
+
+ q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
+ q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
+ q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
+ q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
+
+ q7u16 = vmull_u8(d21u8, d3u8);
+ q8u16 = vmull_u8(d22u8, d3u8);
+ q9u16 = vmull_u8(d23u8, d3u8);
+ q10u16 = vmull_u8(d24u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+
+ q7s16 = vqaddq_s16(q7s16, q3s16);
+ q8s16 = vqaddq_s16(q8s16, q4s16);
+ q9s16 = vqaddq_s16(q9s16, q5s16);
+ q10s16 = vqaddq_s16(q10s16, q6s16);
+
+ d6u8 = vqrshrun_n_s16(q7s16, 7);
+ d7u8 = vqrshrun_n_s16(q8s16, 7);
+ d8u8 = vqrshrun_n_s16(q9s16, 7);
+ d9u8 = vqrshrun_n_s16(q10s16, 7);
+
+ d18u8 = d22u8;
+ d19u8 = d23u8;
+ d20u8 = d24u8;
+ d21u8 = d25u8;
+ d22u8 = d26u8;
+
+ vst1_u8(dst, d6u8);
+ dst += dst_pitch;
+ vst1_u8(dst, d7u8);
+ dst += dst_pitch;
+ vst1_u8(dst, d8u8);
+ dst += dst_pitch;
+ vst1_u8(dst, d9u8);
+ dst += dst_pitch;
+ }
+ }
+ return;
+ }
+
+ // load first_pass filter
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+ // First pass: output_height lines x output_width columns (21x16)
+ if (yoffset == 0) { // firstpass_filter16x16_only
+ src = src_ptr - 2;
+ dst = dst_ptr;
+ for (i = 0; i < 8; i++) {
+ d6u8 = vld1_u8(src);
+ d7u8 = vld1_u8(src + 8);
+ d8u8 = vld1_u8(src + 16);
+ src += src_pixels_per_line;
+ d9u8 = vld1_u8(src);
+ d10u8 = vld1_u8(src + 8);
+ d11u8 = vld1_u8(src + 16);
+ src += src_pixels_per_line;
+
+ __builtin_prefetch(src);
+ __builtin_prefetch(src + src_pixels_per_line);
+
+ q6u16 = vmull_u8(d6u8, d0u8);
+ q7u16 = vmull_u8(d7u8, d0u8);
+ q8u16 = vmull_u8(d9u8, d0u8);
+ q9u16 = vmull_u8(d10u8, d0u8);
+
+ d20u8 = vext_u8(d6u8, d7u8, 1);
+ d21u8 = vext_u8(d9u8, d10u8, 1);
+ d22u8 = vext_u8(d7u8, d8u8, 1);
+ d23u8 = vext_u8(d10u8, d11u8, 1);
+ d24u8 = vext_u8(d6u8, d7u8, 4);
+ d25u8 = vext_u8(d9u8, d10u8, 4);
+ d26u8 = vext_u8(d7u8, d8u8, 4);
+ d27u8 = vext_u8(d10u8, d11u8, 4);
+ d28u8 = vext_u8(d6u8, d7u8, 5);
+ d29u8 = vext_u8(d9u8, d10u8, 5);
+
+ q6u16 = vmlsl_u8(q6u16, d20u8, d1u8);
+ q8u16 = vmlsl_u8(q8u16, d21u8, d1u8);
+ q7u16 = vmlsl_u8(q7u16, d22u8, d1u8);
+ q9u16 = vmlsl_u8(q9u16, d23u8, d1u8);
+ q6u16 = vmlsl_u8(q6u16, d24u8, d4u8);
+ q8u16 = vmlsl_u8(q8u16, d25u8, d4u8);
+ q7u16 = vmlsl_u8(q7u16, d26u8, d4u8);
+ q9u16 = vmlsl_u8(q9u16, d27u8, d4u8);
+ q6u16 = vmlal_u8(q6u16, d28u8, d5u8);
+ q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
+
+ d20u8 = vext_u8(d7u8, d8u8, 5);
+ d21u8 = vext_u8(d10u8, d11u8, 5);
+ d22u8 = vext_u8(d6u8, d7u8, 2);
+ d23u8 = vext_u8(d9u8, d10u8, 2);
+ d24u8 = vext_u8(d7u8, d8u8, 2);
+ d25u8 = vext_u8(d10u8, d11u8, 2);
+ d26u8 = vext_u8(d6u8, d7u8, 3);
+ d27u8 = vext_u8(d9u8, d10u8, 3);
+ d28u8 = vext_u8(d7u8, d8u8, 3);
+ d29u8 = vext_u8(d10u8, d11u8, 3);
+
+ q7u16 = vmlal_u8(q7u16, d20u8, d5u8);
+ q9u16 = vmlal_u8(q9u16, d21u8, d5u8);
+ q6u16 = vmlal_u8(q6u16, d22u8, d2u8);
+ q8u16 = vmlal_u8(q8u16, d23u8, d2u8);
+ q7u16 = vmlal_u8(q7u16, d24u8, d2u8);
+ q9u16 = vmlal_u8(q9u16, d25u8, d2u8);
+
+ q10u16 = vmull_u8(d26u8, d3u8);
+ q11u16 = vmull_u8(d27u8, d3u8);
+ q12u16 = vmull_u8(d28u8, d3u8);
+ q15u16 = vmull_u8(d29u8, d3u8);
+
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+ q11s16 = vreinterpretq_s16_u16(q11u16);
+ q12s16 = vreinterpretq_s16_u16(q12u16);
+ q15s16 = vreinterpretq_s16_u16(q15u16);
+
+ q6s16 = vqaddq_s16(q6s16, q10s16);
+ q8s16 = vqaddq_s16(q8s16, q11s16);
+ q7s16 = vqaddq_s16(q7s16, q12s16);
+ q9s16 = vqaddq_s16(q9s16, q15s16);
+
+ d6u8 = vqrshrun_n_s16(q6s16, 7);
+ d7u8 = vqrshrun_n_s16(q7s16, 7);
+ d8u8 = vqrshrun_n_s16(q8s16, 7);
+ d9u8 = vqrshrun_n_s16(q9s16, 7);
+
+ q3u8 = vcombine_u8(d6u8, d7u8);
+ q4u8 = vcombine_u8(d8u8, d9u8);
+ vst1q_u8(dst, q3u8);
+ dst += dst_pitch;
+ vst1q_u8(dst, q4u8);
+ dst += dst_pitch;
+ }
+ return;
+ }
+
+ src = src_ptr - 2 - src_pixels_per_line * 2;
+ tmpp = tmp;
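+ // Seven iterations of three rows each produce the 21 horizontally
+ // filtered rows (21 * 16 = 336 bytes) staged in tmp for the vertical pass.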
+ for (i = 0; i < 7; i++) {
+ d6u8 = vld1_u8(src);
+ d7u8 = vld1_u8(src + 8);
+ d8u8 = vld1_u8(src + 16);
+ src += src_pixels_per_line;
+ d9u8 = vld1_u8(src);
+ d10u8 = vld1_u8(src + 8);
+ d11u8 = vld1_u8(src + 16);
+ src += src_pixels_per_line;
+ d12u8 = vld1_u8(src);
+ d13u8 = vld1_u8(src + 8);
+ d14u8 = vld1_u8(src + 16);
+ src += src_pixels_per_line;
+
+ __builtin_prefetch(src);
+ __builtin_prefetch(src + src_pixels_per_line);
+ __builtin_prefetch(src + src_pixels_per_line * 2);
+
+ q8u16 = vmull_u8(d6u8, d0u8);
+ q9u16 = vmull_u8(d7u8, d0u8);
+ q10u16 = vmull_u8(d9u8, d0u8);
+ q11u16 = vmull_u8(d10u8, d0u8);
+ q12u16 = vmull_u8(d12u8, d0u8);
+ q13u16 = vmull_u8(d13u8, d0u8);
+
+ d28u8 = vext_u8(d6u8, d7u8, 1);
+ d29u8 = vext_u8(d9u8, d10u8, 1);
+ d30u8 = vext_u8(d12u8, d13u8, 1);
+ q8u16 = vmlsl_u8(q8u16, d28u8, d1u8);
+ q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
+ q12u16 = vmlsl_u8(q12u16, d30u8, d1u8);
+ d28u8 = vext_u8(d7u8, d8u8, 1);
+ d29u8 = vext_u8(d10u8, d11u8, 1);
+ d30u8 = vext_u8(d13u8, d14u8, 1);
+ q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
+ q11u16 = vmlsl_u8(q11u16, d29u8, d1u8);
+ q13u16 = vmlsl_u8(q13u16, d30u8, d1u8);
+
+ d28u8 = vext_u8(d6u8, d7u8, 4);
+ d29u8 = vext_u8(d9u8, d10u8, 4);
+ d30u8 = vext_u8(d12u8, d13u8, 4);
+ q8u16 = vmlsl_u8(q8u16, d28u8, d4u8);
+ q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
+ q12u16 = vmlsl_u8(q12u16, d30u8, d4u8);
+ d28u8 = vext_u8(d7u8, d8u8, 4);
+ d29u8 = vext_u8(d10u8, d11u8, 4);
+ d30u8 = vext_u8(d13u8, d14u8, 4);
+ q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
+ q11u16 = vmlsl_u8(q11u16, d29u8, d4u8);
+ q13u16 = vmlsl_u8(q13u16, d30u8, d4u8);
+
+ d28u8 = vext_u8(d6u8, d7u8, 5);
+ d29u8 = vext_u8(d9u8, d10u8, 5);
+ d30u8 = vext_u8(d12u8, d13u8, 5);
+ q8u16 = vmlal_u8(q8u16, d28u8, d5u8);
+ q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
+ q12u16 = vmlal_u8(q12u16, d30u8, d5u8);
+ d28u8 = vext_u8(d7u8, d8u8, 5);
+ d29u8 = vext_u8(d10u8, d11u8, 5);
+ d30u8 = vext_u8(d13u8, d14u8, 5);
+ q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
+ q11u16 = vmlal_u8(q11u16, d29u8, d5u8);
+ q13u16 = vmlal_u8(q13u16, d30u8, d5u8);
+
+ d28u8 = vext_u8(d6u8, d7u8, 2);
+ d29u8 = vext_u8(d9u8, d10u8, 2);
+ d30u8 = vext_u8(d12u8, d13u8, 2);
+ q8u16 = vmlal_u8(q8u16, d28u8, d2u8);
+ q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
+ q12u16 = vmlal_u8(q12u16, d30u8, d2u8);
+ d28u8 = vext_u8(d7u8, d8u8, 2);
+ d29u8 = vext_u8(d10u8, d11u8, 2);
+ d30u8 = vext_u8(d13u8, d14u8, 2);
+ q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
+ q11u16 = vmlal_u8(q11u16, d29u8, d2u8);
+ q13u16 = vmlal_u8(q13u16, d30u8, d2u8);
+
+ d28u8 = vext_u8(d6u8, d7u8, 3);
+ d29u8 = vext_u8(d9u8, d10u8, 3);
+ d30u8 = vext_u8(d12u8, d13u8, 3);
+ d15u8 = vext_u8(d7u8, d8u8, 3);
+ d31u8 = vext_u8(d10u8, d11u8, 3);
+ d6u8 = vext_u8(d13u8, d14u8, 3);
+ q4u16 = vmull_u8(d28u8, d3u8);
+ q5u16 = vmull_u8(d29u8, d3u8);
+ q6u16 = vmull_u8(d30u8, d3u8);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+ q12s16 = vreinterpretq_s16_u16(q12u16);
+ q8s16 = vqaddq_s16(q8s16, q4s16);
+ q10s16 = vqaddq_s16(q10s16, q5s16);
+ q12s16 = vqaddq_s16(q12s16, q6s16);
+
+ q6u16 = vmull_u8(d15u8, d3u8);
+ q7u16 = vmull_u8(d31u8, d3u8);
+ q3u16 = vmull_u8(d6u8, d3u8);
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q11s16 = vreinterpretq_s16_u16(q11u16);
+ q13s16 = vreinterpretq_s16_u16(q13u16);
+ q9s16 = vqaddq_s16(q9s16, q6s16);
+ q11s16 = vqaddq_s16(q11s16, q7s16);
+ q13s16 = vqaddq_s16(q13s16, q3s16);
+
+ d6u8 = vqrshrun_n_s16(q8s16, 7);
+ d7u8 = vqrshrun_n_s16(q9s16, 7);
+ d8u8 = vqrshrun_n_s16(q10s16, 7);
+ d9u8 = vqrshrun_n_s16(q11s16, 7);
+ d10u8 = vqrshrun_n_s16(q12s16, 7);
+ d11u8 = vqrshrun_n_s16(q13s16, 7);
+
+ vst1_u8(tmpp, d6u8);
+ tmpp += 8;
+ vst1_u8(tmpp, d7u8);
+ tmpp += 8;
+ vst1_u8(tmpp, d8u8);
+ tmpp += 8;
+ vst1_u8(tmpp, d9u8);
+ tmpp += 8;
+ vst1_u8(tmpp, d10u8);
+ tmpp += 8;
+ vst1_u8(tmpp, d11u8);
+ tmpp += 8;
+ }
+
+ // Second pass: 16x16
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
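+ // Filter the 16-wide block as two 8-wide halves; each half walks its own
+ // column of the 16-byte-stride rows in tmp.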
+ for (i = 0; i < 2; i++) {
+ dst = dst_ptr + 8 * i;
+ tmpp = tmp + 8 * i;
+ d18u8 = vld1_u8(tmpp);
+ tmpp += 16;
+ d19u8 = vld1_u8(tmpp);
+ tmpp += 16;
+ d20u8 = vld1_u8(tmpp);
+ tmpp += 16;
+ d21u8 = vld1_u8(tmpp);
+ tmpp += 16;
+ d22u8 = vld1_u8(tmpp);
+ tmpp += 16;
+ for (j = 0; j < 4; j++) {
+ d23u8 = vld1_u8(tmpp);
+ tmpp += 16;
+ d24u8 = vld1_u8(tmpp);
+ tmpp += 16;
+ d25u8 = vld1_u8(tmpp);
+ tmpp += 16;
+ d26u8 = vld1_u8(tmpp);
+ tmpp += 16;
+
+ q3u16 = vmull_u8(d18u8, d0u8);
+ q4u16 = vmull_u8(d19u8, d0u8);
+ q5u16 = vmull_u8(d20u8, d0u8);
+ q6u16 = vmull_u8(d21u8, d0u8);
+
+ q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
+ q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
+ q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
+ q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
+
+ q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
+ q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
+ q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
+ q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
+
+ q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
+ q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
+ q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
+ q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
+
+ q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
+ q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
+ q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
+ q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
+
+ q7u16 = vmull_u8(d21u8, d3u8);
+ q8u16 = vmull_u8(d22u8, d3u8);
+ q9u16 = vmull_u8(d23u8, d3u8);
+ q10u16 = vmull_u8(d24u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+
+ q7s16 = vqaddq_s16(q7s16, q3s16);
+ q8s16 = vqaddq_s16(q8s16, q4s16);
+ q9s16 = vqaddq_s16(q9s16, q5s16);
+ q10s16 = vqaddq_s16(q10s16, q6s16);
+
+ d6u8 = vqrshrun_n_s16(q7s16, 7);
+ d7u8 = vqrshrun_n_s16(q8s16, 7);
+ d8u8 = vqrshrun_n_s16(q9s16, 7);
+ d9u8 = vqrshrun_n_s16(q10s16, 7);
+
+ d18u8 = d22u8;
+ d19u8 = d23u8;
+ d20u8 = d24u8;
+ d21u8 = d25u8;
+ d22u8 = d26u8;
+
+ vst1_u8(dst, d6u8);
+ dst += dst_pitch;
+ vst1_u8(dst, d7u8);
+ dst += dst_pitch;
+ vst1_u8(dst, d8u8);
+ dst += dst_pitch;
+ vst1_u8(dst, d9u8);
+ dst += dst_pitch;
+ }
+ }
+ return;
+}
diff --git a/media/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c b/media/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c
new file mode 100644
index 000000000..974d3b653
--- /dev/null
+++ b/media/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c
@@ -0,0 +1,1017 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
+
+static const uint16_t bilinear_taps_coeff[8][2] = {
+ {128, 0},
+ {112, 16},
+ { 96, 32},
+ { 80, 48},
+ { 64, 64},
+ { 48, 80},
+ { 32, 96},
+ { 16, 112}
+};
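+
+// Each pair of taps sums to 128, so the bilinear results below are
+// normalized with a rounding shift right by 7.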
+
+unsigned int vp8_sub_pixel_variance16x16_neon_func(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse) {
+ int i;
+ DECLARE_ALIGNED(16, unsigned char, tmp[528]);
+ unsigned char *tmpp;
+ unsigned char *tmpp2;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
+ uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8;
+ uint8x8_t d19u8, d20u8, d21u8;
+ int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
+ uint32x2_t d0u32, d10u32;
+ int64x1_t d0s64, d1s64, d2s64, d3s64;
+ uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8;
+ uint8x16_t q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+ uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16;
+ uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16;
+ int32x4_t q8s32, q9s32, q10s32;
+ int64x2_t q0s64, q1s64, q5s64;
+
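+ // tmp is split in two: the first 272 bytes (17 rows x 16) hold the
+ // first-pass output, and the filtered 16x16 block is written at
+ // tmp + 272 (256 bytes).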
+ tmpp2 = tmp + 272;
+ tmpp = tmp;
+ if (xoffset == 0) { // secondpass_bfilter16x16_only
+ d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]);
+ d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]);
+
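+ // Vertical-only bilinear filter: each output row blends a row with the
+ // row below it, then rounds by the 7-bit tap scale.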
+ q11u8 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ for (i = 4; i > 0; i--) {
+ q12u8 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ q13u8 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ q14u8 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ q15u8 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+
+ __builtin_prefetch(src_ptr);
+ __builtin_prefetch(src_ptr + src_pixels_per_line);
+ __builtin_prefetch(src_ptr + src_pixels_per_line * 2);
+
+ q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
+ q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
+ q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
+ q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
+ q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
+ q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
+ q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
+ q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);
+
+ q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
+ q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
+ q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
+ q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
+ q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
+ q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
+ q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
+ q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);
+
+ d2u8 = vqrshrn_n_u16(q1u16, 7);
+ d3u8 = vqrshrn_n_u16(q2u16, 7);
+ d4u8 = vqrshrn_n_u16(q3u16, 7);
+ d5u8 = vqrshrn_n_u16(q4u16, 7);
+ d6u8 = vqrshrn_n_u16(q5u16, 7);
+ d7u8 = vqrshrn_n_u16(q6u16, 7);
+ d8u8 = vqrshrn_n_u16(q7u16, 7);
+ d9u8 = vqrshrn_n_u16(q8u16, 7);
+
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ q2u8 = vcombine_u8(d4u8, d5u8);
+ q3u8 = vcombine_u8(d6u8, d7u8);
+ q4u8 = vcombine_u8(d8u8, d9u8);
+
+ q11u8 = q15u8;
+
+ vst1q_u8((uint8_t *)tmpp2, q1u8);
+ tmpp2 += 16;
+ vst1q_u8((uint8_t *)tmpp2, q2u8);
+ tmpp2 += 16;
+ vst1q_u8((uint8_t *)tmpp2, q3u8);
+ tmpp2 += 16;
+ vst1q_u8((uint8_t *)tmpp2, q4u8);
+ tmpp2 += 16;
+ }
+ } else if (yoffset == 0) { // firstpass_bfilter16x16_only
+ d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]);
+ d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]);
+
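+ // Horizontal-only bilinear filter: each output pixel blends a pixel with
+ // its right-hand neighbour (vext by 1).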
+ for (i = 4; i > 0 ; i--) {
+ d2u8 = vld1_u8(src_ptr);
+ d3u8 = vld1_u8(src_ptr + 8);
+ d4u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+ d5u8 = vld1_u8(src_ptr);
+ d6u8 = vld1_u8(src_ptr + 8);
+ d7u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+ d8u8 = vld1_u8(src_ptr);
+ d9u8 = vld1_u8(src_ptr + 8);
+ d10u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+ d11u8 = vld1_u8(src_ptr);
+ d12u8 = vld1_u8(src_ptr + 8);
+ d13u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+
+ __builtin_prefetch(src_ptr);
+ __builtin_prefetch(src_ptr + src_pixels_per_line);
+ __builtin_prefetch(src_ptr + src_pixels_per_line * 2);
+
+ q7u16 = vmull_u8(d2u8, d0u8);
+ q8u16 = vmull_u8(d3u8, d0u8);
+ q9u16 = vmull_u8(d5u8, d0u8);
+ q10u16 = vmull_u8(d6u8, d0u8);
+ q11u16 = vmull_u8(d8u8, d0u8);
+ q12u16 = vmull_u8(d9u8, d0u8);
+ q13u16 = vmull_u8(d11u8, d0u8);
+ q14u16 = vmull_u8(d12u8, d0u8);
+
+ d2u8 = vext_u8(d2u8, d3u8, 1);
+ d5u8 = vext_u8(d5u8, d6u8, 1);
+ d8u8 = vext_u8(d8u8, d9u8, 1);
+ d11u8 = vext_u8(d11u8, d12u8, 1);
+
+ q7u16 = vmlal_u8(q7u16, d2u8, d1u8);
+ q9u16 = vmlal_u8(q9u16, d5u8, d1u8);
+ q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
+ q13u16 = vmlal_u8(q13u16, d11u8, d1u8);
+
+ d3u8 = vext_u8(d3u8, d4u8, 1);
+ d6u8 = vext_u8(d6u8, d7u8, 1);
+ d9u8 = vext_u8(d9u8, d10u8, 1);
+ d12u8 = vext_u8(d12u8, d13u8, 1);
+
+ q8u16 = vmlal_u8(q8u16, d3u8, d1u8);
+ q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
+ q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
+ q14u16 = vmlal_u8(q14u16, d12u8, d1u8);
+
+ d14u8 = vqrshrn_n_u16(q7u16, 7);
+ d15u8 = vqrshrn_n_u16(q8u16, 7);
+ d16u8 = vqrshrn_n_u16(q9u16, 7);
+ d17u8 = vqrshrn_n_u16(q10u16, 7);
+ d18u8 = vqrshrn_n_u16(q11u16, 7);
+ d19u8 = vqrshrn_n_u16(q12u16, 7);
+ d20u8 = vqrshrn_n_u16(q13u16, 7);
+ d21u8 = vqrshrn_n_u16(q14u16, 7);
+
+ q7u8 = vcombine_u8(d14u8, d15u8);
+ q8u8 = vcombine_u8(d16u8, d17u8);
+ q9u8 = vcombine_u8(d18u8, d19u8);
+ q10u8 = vcombine_u8(d20u8, d21u8);
+
+ vst1q_u8((uint8_t *)tmpp2, q7u8);
+ tmpp2 += 16;
+ vst1q_u8((uint8_t *)tmpp2, q8u8);
+ tmpp2 += 16;
+ vst1q_u8((uint8_t *)tmpp2, q9u8);
+ tmpp2 += 16;
+ vst1q_u8((uint8_t *)tmpp2, q10u8);
+ tmpp2 += 16;
+ }
+ } else {
+ d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]);
+ d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]);
+
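+ // Both offsets set: the horizontal pass below writes 17 filtered rows
+ // into tmp, and the vertical pass further down reads them back.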
+ d2u8 = vld1_u8(src_ptr);
+ d3u8 = vld1_u8(src_ptr + 8);
+ d4u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+ d5u8 = vld1_u8(src_ptr);
+ d6u8 = vld1_u8(src_ptr + 8);
+ d7u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+ d8u8 = vld1_u8(src_ptr);
+ d9u8 = vld1_u8(src_ptr + 8);
+ d10u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+ d11u8 = vld1_u8(src_ptr);
+ d12u8 = vld1_u8(src_ptr + 8);
+ d13u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+
+ // First Pass: output_height lines x output_width columns (17x16)
+ for (i = 3; i > 0; i--) {
+ q7u16 = vmull_u8(d2u8, d0u8);
+ q8u16 = vmull_u8(d3u8, d0u8);
+ q9u16 = vmull_u8(d5u8, d0u8);
+ q10u16 = vmull_u8(d6u8, d0u8);
+ q11u16 = vmull_u8(d8u8, d0u8);
+ q12u16 = vmull_u8(d9u8, d0u8);
+ q13u16 = vmull_u8(d11u8, d0u8);
+ q14u16 = vmull_u8(d12u8, d0u8);
+
+ d2u8 = vext_u8(d2u8, d3u8, 1);
+ d5u8 = vext_u8(d5u8, d6u8, 1);
+ d8u8 = vext_u8(d8u8, d9u8, 1);
+ d11u8 = vext_u8(d11u8, d12u8, 1);
+
+ q7u16 = vmlal_u8(q7u16, d2u8, d1u8);
+ q9u16 = vmlal_u8(q9u16, d5u8, d1u8);
+ q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
+ q13u16 = vmlal_u8(q13u16, d11u8, d1u8);
+
+ d3u8 = vext_u8(d3u8, d4u8, 1);
+ d6u8 = vext_u8(d6u8, d7u8, 1);
+ d9u8 = vext_u8(d9u8, d10u8, 1);
+ d12u8 = vext_u8(d12u8, d13u8, 1);
+
+ q8u16 = vmlal_u8(q8u16, d3u8, d1u8);
+ q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
+ q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
+ q14u16 = vmlal_u8(q14u16, d12u8, d1u8);
+
+ d14u8 = vqrshrn_n_u16(q7u16, 7);
+ d15u8 = vqrshrn_n_u16(q8u16, 7);
+ d16u8 = vqrshrn_n_u16(q9u16, 7);
+ d17u8 = vqrshrn_n_u16(q10u16, 7);
+ d18u8 = vqrshrn_n_u16(q11u16, 7);
+ d19u8 = vqrshrn_n_u16(q12u16, 7);
+ d20u8 = vqrshrn_n_u16(q13u16, 7);
+ d21u8 = vqrshrn_n_u16(q14u16, 7);
+
+ d2u8 = vld1_u8(src_ptr);
+ d3u8 = vld1_u8(src_ptr + 8);
+ d4u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+ d5u8 = vld1_u8(src_ptr);
+ d6u8 = vld1_u8(src_ptr + 8);
+ d7u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+ d8u8 = vld1_u8(src_ptr);
+ d9u8 = vld1_u8(src_ptr + 8);
+ d10u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+ d11u8 = vld1_u8(src_ptr);
+ d12u8 = vld1_u8(src_ptr + 8);
+ d13u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+
+ q7u8 = vcombine_u8(d14u8, d15u8);
+ q8u8 = vcombine_u8(d16u8, d17u8);
+ q9u8 = vcombine_u8(d18u8, d19u8);
+ q10u8 = vcombine_u8(d20u8, d21u8);
+
+ vst1q_u8((uint8_t *)tmpp, q7u8);
+ tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q8u8);
+ tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q9u8);
+ tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q10u8);
+ tmpp += 16;
+ }
+
+        // First-pass filtering for the remaining 5 lines (rows 12..16 of the
+        // 17-row intermediate buffer)
+ d14u8 = vld1_u8(src_ptr);
+ d15u8 = vld1_u8(src_ptr + 8);
+ d16u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+
+ q9u16 = vmull_u8(d2u8, d0u8);
+ q10u16 = vmull_u8(d3u8, d0u8);
+ q11u16 = vmull_u8(d5u8, d0u8);
+ q12u16 = vmull_u8(d6u8, d0u8);
+ q13u16 = vmull_u8(d8u8, d0u8);
+ q14u16 = vmull_u8(d9u8, d0u8);
+
+ d2u8 = vext_u8(d2u8, d3u8, 1);
+ d5u8 = vext_u8(d5u8, d6u8, 1);
+ d8u8 = vext_u8(d8u8, d9u8, 1);
+
+ q9u16 = vmlal_u8(q9u16, d2u8, d1u8);
+ q11u16 = vmlal_u8(q11u16, d5u8, d1u8);
+ q13u16 = vmlal_u8(q13u16, d8u8, d1u8);
+
+ d3u8 = vext_u8(d3u8, d4u8, 1);
+ d6u8 = vext_u8(d6u8, d7u8, 1);
+ d9u8 = vext_u8(d9u8, d10u8, 1);
+
+ q10u16 = vmlal_u8(q10u16, d3u8, d1u8);
+ q12u16 = vmlal_u8(q12u16, d6u8, d1u8);
+ q14u16 = vmlal_u8(q14u16, d9u8, d1u8);
+
+ q1u16 = vmull_u8(d11u8, d0u8);
+ q2u16 = vmull_u8(d12u8, d0u8);
+ q3u16 = vmull_u8(d14u8, d0u8);
+ q4u16 = vmull_u8(d15u8, d0u8);
+
+ d11u8 = vext_u8(d11u8, d12u8, 1);
+ d14u8 = vext_u8(d14u8, d15u8, 1);
+
+ q1u16 = vmlal_u8(q1u16, d11u8, d1u8);
+ q3u16 = vmlal_u8(q3u16, d14u8, d1u8);
+
+ d12u8 = vext_u8(d12u8, d13u8, 1);
+ d15u8 = vext_u8(d15u8, d16u8, 1);
+
+ q2u16 = vmlal_u8(q2u16, d12u8, d1u8);
+ q4u16 = vmlal_u8(q4u16, d15u8, d1u8);
+
+ d10u8 = vqrshrn_n_u16(q9u16, 7);
+ d11u8 = vqrshrn_n_u16(q10u16, 7);
+ d12u8 = vqrshrn_n_u16(q11u16, 7);
+ d13u8 = vqrshrn_n_u16(q12u16, 7);
+ d14u8 = vqrshrn_n_u16(q13u16, 7);
+ d15u8 = vqrshrn_n_u16(q14u16, 7);
+ d16u8 = vqrshrn_n_u16(q1u16, 7);
+ d17u8 = vqrshrn_n_u16(q2u16, 7);
+ d18u8 = vqrshrn_n_u16(q3u16, 7);
+ d19u8 = vqrshrn_n_u16(q4u16, 7);
+
+ q5u8 = vcombine_u8(d10u8, d11u8);
+ q6u8 = vcombine_u8(d12u8, d13u8);
+ q7u8 = vcombine_u8(d14u8, d15u8);
+ q8u8 = vcombine_u8(d16u8, d17u8);
+ q9u8 = vcombine_u8(d18u8, d19u8);
+
+ vst1q_u8((uint8_t *)tmpp, q5u8);
+ tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q6u8);
+ tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q7u8);
+ tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q8u8);
+ tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q9u8);
+
+        // Second pass: vertical 2-tap (yoffset) filter over the 17-row
+        // intermediate buffer
+ d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]);
+ d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]);
+
+ tmpp = tmp;
+ tmpp2 = tmpp + 272;
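+        // 272 = 16 * 17: the first-pass result occupies 17 rows of 16 bytes,
+        // and the second-pass output is written immediately after it.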
+ q11u8 = vld1q_u8(tmpp);
+ tmpp += 16;
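+        // q11u8 seeds the vertical filter with the first intermediate row;
+        // each iteration loads four more rows, produces four output rows, and
+        // carries the last loaded row over (q11u8 = q15u8 below).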
+ for (i = 4; i > 0; i--) {
+ q12u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+ q13u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+ q14u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+ q15u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+
+ q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
+ q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
+ q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
+ q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
+ q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
+ q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
+ q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
+ q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);
+
+ q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
+ q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
+ q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
+ q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
+ q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
+ q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
+ q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
+ q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);
+
+ d2u8 = vqrshrn_n_u16(q1u16, 7);
+ d3u8 = vqrshrn_n_u16(q2u16, 7);
+ d4u8 = vqrshrn_n_u16(q3u16, 7);
+ d5u8 = vqrshrn_n_u16(q4u16, 7);
+ d6u8 = vqrshrn_n_u16(q5u16, 7);
+ d7u8 = vqrshrn_n_u16(q6u16, 7);
+ d8u8 = vqrshrn_n_u16(q7u16, 7);
+ d9u8 = vqrshrn_n_u16(q8u16, 7);
+
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ q2u8 = vcombine_u8(d4u8, d5u8);
+ q3u8 = vcombine_u8(d6u8, d7u8);
+ q4u8 = vcombine_u8(d8u8, d9u8);
+
+ q11u8 = q15u8;
+
+ vst1q_u8((uint8_t *)tmpp2, q1u8);
+ tmpp2 += 16;
+ vst1q_u8((uint8_t *)tmpp2, q2u8);
+ tmpp2 += 16;
+ vst1q_u8((uint8_t *)tmpp2, q3u8);
+ tmpp2 += 16;
+ vst1q_u8((uint8_t *)tmpp2, q4u8);
+ tmpp2 += 16;
+ }
+ }
+
+    // sub_pixel_variance16x16_neon: variance of the filtered 16x16 block
+    // (at tmp + 272) against dst_ptr
+ q8s32 = vdupq_n_s32(0);
+ q9s32 = vdupq_n_s32(0);
+ q10s32 = vdupq_n_s32(0);
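+    // q8s32 accumulates the signed sum of (filtered - dst) differences via
+    // vpadalq_s16; q9s32/q10s32 accumulate the squared differences via
+    // vmlal_s16.  vsubl_u8 wraps modulo 2^16 when dst is larger, but
+    // reinterpreting the result as s16 still yields the correct difference.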
+
+ tmpp = tmp + 272;
+ for (i = 0; i < 8; i++) { // sub_pixel_variance16x16_neon_loop
+ q0u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+ q1u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+ q2u8 = vld1q_u8(dst_ptr);
+ dst_ptr += dst_pixels_per_line;
+ q3u8 = vld1q_u8(dst_ptr);
+ dst_ptr += dst_pixels_per_line;
+
+ d0u8 = vget_low_u8(q0u8);
+ d1u8 = vget_high_u8(q0u8);
+ d2u8 = vget_low_u8(q1u8);
+ d3u8 = vget_high_u8(q1u8);
+
+ q11u16 = vsubl_u8(d0u8, vget_low_u8(q2u8));
+ q12u16 = vsubl_u8(d1u8, vget_high_u8(q2u8));
+ q13u16 = vsubl_u8(d2u8, vget_low_u8(q3u8));
+ q14u16 = vsubl_u8(d3u8, vget_high_u8(q3u8));
+
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
+ q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
+ q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
+
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
+ q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+ q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
+ q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
+ q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
+
+ d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
+ d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
+ q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
+ q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
+ }
+
+ q10s32 = vaddq_s32(q10s32, q9s32);
+ q0s64 = vpaddlq_s32(q8s32);
+ q1s64 = vpaddlq_s32(q10s32);
+
+ d0s64 = vget_low_s64(q0s64);
+ d1s64 = vget_high_s64(q0s64);
+ d2s64 = vget_low_s64(q1s64);
+ d3s64 = vget_high_s64(q1s64);
+ d0s64 = vadd_s64(d0s64, d1s64);
+ d1s64 = vadd_s64(d2s64, d3s64);
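+    // d0s64 now holds the total difference sum and d1s64 the total SSE.
+    // variance = SSE - sum^2 / 256 (256 = 16 * 16 pixels); the division is
+    // the vshr_n_u32(..., 8) below.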
+
+ q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
+ vreinterpret_s32_s64(d0s64));
+ vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+ d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
+ d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+ return vget_lane_u32(d0u32, 0);
+}
+
+unsigned int vp8_variance_halfpixvar16x16_h_neon(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ int i;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+ int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ uint32x2_t d0u32, d10u32;
+ int64x1_t d0s64, d1s64, d2s64, d3s64;
+ uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8;
+ uint8x16_t q7u8, q11u8, q12u8, q13u8, q14u8;
+ uint16x8_t q0u16, q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16;
+ int32x4_t q8s32, q9s32, q10s32;
+ int64x2_t q0s64, q1s64, q5s64;
+
+ q8s32 = vdupq_n_s32(0);
+ q9s32 = vdupq_n_s32(0);
+ q10s32 = vdupq_n_s32(0);
+
+ for (i = 0; i < 4; i++) { // vp8_filt_fpo16x16s_4_0_loop_neon
+ q0u8 = vld1q_u8(src_ptr);
+ q1u8 = vld1q_u8(src_ptr + 16);
+ src_ptr += source_stride;
+ q2u8 = vld1q_u8(src_ptr);
+ q3u8 = vld1q_u8(src_ptr + 16);
+ src_ptr += source_stride;
+ q4u8 = vld1q_u8(src_ptr);
+ q5u8 = vld1q_u8(src_ptr + 16);
+ src_ptr += source_stride;
+ q6u8 = vld1q_u8(src_ptr);
+ q7u8 = vld1q_u8(src_ptr + 16);
+ src_ptr += source_stride;
+
+ q11u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ q12u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ q13u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ q14u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+
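+        // Horizontal half-pel: vextq_u8 shifts each 32-byte row left by one
+        // byte to form src + 1, and vrhaddq_u8 takes the rounding average
+        // (a + b + 1) >> 1, which is exactly the {64, 64} bilinear tap pair.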
+ q1u8 = vextq_u8(q0u8, q1u8, 1);
+ q3u8 = vextq_u8(q2u8, q3u8, 1);
+ q5u8 = vextq_u8(q4u8, q5u8, 1);
+ q7u8 = vextq_u8(q6u8, q7u8, 1);
+
+ q0u8 = vrhaddq_u8(q0u8, q1u8);
+ q1u8 = vrhaddq_u8(q2u8, q3u8);
+ q2u8 = vrhaddq_u8(q4u8, q5u8);
+ q3u8 = vrhaddq_u8(q6u8, q7u8);
+
+ d0u8 = vget_low_u8(q0u8);
+ d1u8 = vget_high_u8(q0u8);
+ d2u8 = vget_low_u8(q1u8);
+ d3u8 = vget_high_u8(q1u8);
+ d4u8 = vget_low_u8(q2u8);
+ d5u8 = vget_high_u8(q2u8);
+ d6u8 = vget_low_u8(q3u8);
+ d7u8 = vget_high_u8(q3u8);
+
+ q4u16 = vsubl_u8(d0u8, vget_low_u8(q11u8));
+ q5u16 = vsubl_u8(d1u8, vget_high_u8(q11u8));
+ q6u16 = vsubl_u8(d2u8, vget_low_u8(q12u8));
+ q7u16 = vsubl_u8(d3u8, vget_high_u8(q12u8));
+ q0u16 = vsubl_u8(d4u8, vget_low_u8(q13u8));
+ q1u16 = vsubl_u8(d5u8, vget_high_u8(q13u8));
+ q2u16 = vsubl_u8(d6u8, vget_low_u8(q14u8));
+ q3u16 = vsubl_u8(d7u8, vget_high_u8(q14u8));
+
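+        // Fold each 16-bit difference vector into the running sum
+        // (vpadalq_s16) and its squares into q9s32/q10s32 (vmlal_s16).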
+ d8s16 = vreinterpret_s16_u16(vget_low_u16(q4u16));
+ d9s16 = vreinterpret_s16_u16(vget_high_u16(q4u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q4u16));
+ q9s32 = vmlal_s16(q9s32, d8s16, d8s16);
+ q10s32 = vmlal_s16(q10s32, d9s16, d9s16);
+ d10s16 = vreinterpret_s16_u16(vget_low_u16(q5u16));
+ d11s16 = vreinterpret_s16_u16(vget_high_u16(q5u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q5u16));
+ q9s32 = vmlal_s16(q9s32, d10s16, d10s16);
+ q10s32 = vmlal_s16(q10s32, d11s16, d11s16);
+ d12s16 = vreinterpret_s16_u16(vget_low_u16(q6u16));
+ d13s16 = vreinterpret_s16_u16(vget_high_u16(q6u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q6u16));
+ q9s32 = vmlal_s16(q9s32, d12s16, d12s16);
+ q10s32 = vmlal_s16(q10s32, d13s16, d13s16);
+ d14s16 = vreinterpret_s16_u16(vget_low_u16(q7u16));
+ d15s16 = vreinterpret_s16_u16(vget_high_u16(q7u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q7u16));
+ q9s32 = vmlal_s16(q9s32, d14s16, d14s16);
+ q10s32 = vmlal_s16(q10s32, d15s16, d15s16);
+ d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16));
+ d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q0u16));
+ q9s32 = vmlal_s16(q9s32, d0s16, d0s16);
+ q10s32 = vmlal_s16(q10s32, d1s16, d1s16);
+ d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16));
+ d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q1u16));
+ q9s32 = vmlal_s16(q9s32, d2s16, d2s16);
+ q10s32 = vmlal_s16(q10s32, d3s16, d3s16);
+ d4s16 = vreinterpret_s16_u16(vget_low_u16(q2u16));
+ d5s16 = vreinterpret_s16_u16(vget_high_u16(q2u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q2u16));
+ q9s32 = vmlal_s16(q9s32, d4s16, d4s16);
+ q10s32 = vmlal_s16(q10s32, d5s16, d5s16);
+ d6s16 = vreinterpret_s16_u16(vget_low_u16(q3u16));
+ d7s16 = vreinterpret_s16_u16(vget_high_u16(q3u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q3u16));
+ q9s32 = vmlal_s16(q9s32, d6s16, d6s16);
+ q10s32 = vmlal_s16(q10s32, d7s16, d7s16);
+ }
+
+ q10s32 = vaddq_s32(q10s32, q9s32);
+ q0s64 = vpaddlq_s32(q8s32);
+ q1s64 = vpaddlq_s32(q10s32);
+
+ d0s64 = vget_low_s64(q0s64);
+ d1s64 = vget_high_s64(q0s64);
+ d2s64 = vget_low_s64(q1s64);
+ d3s64 = vget_high_s64(q1s64);
+ d0s64 = vadd_s64(d0s64, d1s64);
+ d1s64 = vadd_s64(d2s64, d3s64);
+
+ q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
+ vreinterpret_s32_s64(d0s64));
+ vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+ d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
+ d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+ return vget_lane_u32(d0u32, 0);
+}
+
+unsigned int vp8_variance_halfpixvar16x16_v_neon(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ int i;
+ uint8x8_t d0u8, d1u8, d4u8, d5u8, d8u8, d9u8, d12u8, d13u8;
+ int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
+ int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+ uint32x2_t d0u32, d10u32;
+ int64x1_t d0s64, d1s64, d2s64, d3s64;
+ uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q15u8;
+ uint16x8_t q0u16, q1u16, q2u16, q3u16, q11u16, q12u16, q13u16, q14u16;
+ int32x4_t q8s32, q9s32, q10s32;
+ int64x2_t q0s64, q1s64, q5s64;
+
+ q8s32 = vdupq_n_s32(0);
+ q9s32 = vdupq_n_s32(0);
+ q10s32 = vdupq_n_s32(0);
+
+ q0u8 = vld1q_u8(src_ptr);
+ src_ptr += source_stride;
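+    // q0u8 holds the row above the current group; each output row is the
+    // rounding average of a row and the row below it, and the last row
+    // loaded (q15u8) is carried into the next iteration.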
+    for (i = 0; i < 4; i++) {  // vertical half-pel + variance accumulation
+ q2u8 = vld1q_u8(src_ptr);
+ src_ptr += source_stride;
+ q4u8 = vld1q_u8(src_ptr);
+ src_ptr += source_stride;
+ q6u8 = vld1q_u8(src_ptr);
+ src_ptr += source_stride;
+ q15u8 = vld1q_u8(src_ptr);
+ src_ptr += source_stride;
+
+ q1u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ q3u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ q5u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ q7u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+
+ q0u8 = vrhaddq_u8(q0u8, q2u8);
+ q2u8 = vrhaddq_u8(q2u8, q4u8);
+ q4u8 = vrhaddq_u8(q4u8, q6u8);
+ q6u8 = vrhaddq_u8(q6u8, q15u8);
+
+ d0u8 = vget_low_u8(q0u8);
+ d1u8 = vget_high_u8(q0u8);
+ d4u8 = vget_low_u8(q2u8);
+ d5u8 = vget_high_u8(q2u8);
+ d8u8 = vget_low_u8(q4u8);
+ d9u8 = vget_high_u8(q4u8);
+ d12u8 = vget_low_u8(q6u8);
+ d13u8 = vget_high_u8(q6u8);
+
+ q11u16 = vsubl_u8(d0u8, vget_low_u8(q1u8));
+ q12u16 = vsubl_u8(d1u8, vget_high_u8(q1u8));
+ q13u16 = vsubl_u8(d4u8, vget_low_u8(q3u8));
+ q14u16 = vsubl_u8(d5u8, vget_high_u8(q3u8));
+ q0u16 = vsubl_u8(d8u8, vget_low_u8(q5u8));
+ q1u16 = vsubl_u8(d9u8, vget_high_u8(q5u8));
+ q2u16 = vsubl_u8(d12u8, vget_low_u8(q7u8));
+ q3u16 = vsubl_u8(d13u8, vget_high_u8(q7u8));
+
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
+ q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
+ q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
+ q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+ q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
+ q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
+ q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
+ d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
+ d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
+ q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
+ q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
+ d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16));
+ d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q0u16));
+ q9s32 = vmlal_s16(q9s32, d0s16, d0s16);
+ q10s32 = vmlal_s16(q10s32, d1s16, d1s16);
+ d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16));
+ d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q1u16));
+ q9s32 = vmlal_s16(q9s32, d2s16, d2s16);
+ q10s32 = vmlal_s16(q10s32, d3s16, d3s16);
+ d4s16 = vreinterpret_s16_u16(vget_low_u16(q2u16));
+ d5s16 = vreinterpret_s16_u16(vget_high_u16(q2u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q2u16));
+ q9s32 = vmlal_s16(q9s32, d4s16, d4s16);
+ q10s32 = vmlal_s16(q10s32, d5s16, d5s16);
+ d6s16 = vreinterpret_s16_u16(vget_low_u16(q3u16));
+ d7s16 = vreinterpret_s16_u16(vget_high_u16(q3u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q3u16));
+ q9s32 = vmlal_s16(q9s32, d6s16, d6s16);
+ q10s32 = vmlal_s16(q10s32, d7s16, d7s16);
+
+ q0u8 = q15u8;
+ }
+
+ q10s32 = vaddq_s32(q10s32, q9s32);
+ q0s64 = vpaddlq_s32(q8s32);
+ q1s64 = vpaddlq_s32(q10s32);
+
+ d0s64 = vget_low_s64(q0s64);
+ d1s64 = vget_high_s64(q0s64);
+ d2s64 = vget_low_s64(q1s64);
+ d3s64 = vget_high_s64(q1s64);
+ d0s64 = vadd_s64(d0s64, d1s64);
+ d1s64 = vadd_s64(d2s64, d3s64);
+
+ q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
+ vreinterpret_s32_s64(d0s64));
+ vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+ d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
+ d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+ return vget_lane_u32(d0u32, 0);
+}
+
+unsigned int vp8_variance_halfpixvar16x16_hv_neon(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ int i;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+ int16x4_t d0s16, d1s16, d2s16, d3s16, d10s16, d11s16, d12s16, d13s16;
+ int16x4_t d18s16, d19s16, d20s16, d21s16, d22s16, d23s16, d24s16, d25s16;
+ uint32x2_t d0u32, d10u32;
+ int64x1_t d0s64, d1s64, d2s64, d3s64;
+ uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8;
+ uint16x8_t q0u16, q1u16, q5u16, q6u16, q9u16, q10u16, q11u16, q12u16;
+ int32x4_t q13s32, q14s32, q15s32;
+ int64x2_t q0s64, q1s64, q5s64;
+
+ q13s32 = vdupq_n_s32(0);
+ q14s32 = vdupq_n_s32(0);
+ q15s32 = vdupq_n_s32(0);
+
+ q0u8 = vld1q_u8(src_ptr);
+ q1u8 = vld1q_u8(src_ptr + 16);
+ src_ptr += source_stride;
+ q1u8 = vextq_u8(q0u8, q1u8, 1);
+ q0u8 = vrhaddq_u8(q0u8, q1u8);
+    for (i = 0; i < 4; i++) {  // horizontal + vertical half-pel + variance accumulation
+ q2u8 = vld1q_u8(src_ptr);
+ q3u8 = vld1q_u8(src_ptr + 16);
+ src_ptr += source_stride;
+ q4u8 = vld1q_u8(src_ptr);
+ q5u8 = vld1q_u8(src_ptr + 16);
+ src_ptr += source_stride;
+ q6u8 = vld1q_u8(src_ptr);
+ q7u8 = vld1q_u8(src_ptr + 16);
+ src_ptr += source_stride;
+ q8u8 = vld1q_u8(src_ptr);
+ q9u8 = vld1q_u8(src_ptr + 16);
+ src_ptr += source_stride;
+
+ q3u8 = vextq_u8(q2u8, q3u8, 1);
+ q5u8 = vextq_u8(q4u8, q5u8, 1);
+ q7u8 = vextq_u8(q6u8, q7u8, 1);
+ q9u8 = vextq_u8(q8u8, q9u8, 1);
+
+ q1u8 = vrhaddq_u8(q2u8, q3u8);
+ q2u8 = vrhaddq_u8(q4u8, q5u8);
+ q3u8 = vrhaddq_u8(q6u8, q7u8);
+ q4u8 = vrhaddq_u8(q8u8, q9u8);
+ q0u8 = vrhaddq_u8(q0u8, q1u8);
+ q1u8 = vrhaddq_u8(q1u8, q2u8);
+ q2u8 = vrhaddq_u8(q2u8, q3u8);
+ q3u8 = vrhaddq_u8(q3u8, q4u8);
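+        // Each output is a rounding average of two rounding averages
+        // (horizontal, then vertical), which is not always identical to a
+        // single (a + b + c + d + 2) >> 2.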
+
+ q5u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ q6u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ q7u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ q8u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+
+ d0u8 = vget_low_u8(q0u8);
+ d1u8 = vget_high_u8(q0u8);
+ d2u8 = vget_low_u8(q1u8);
+ d3u8 = vget_high_u8(q1u8);
+ d4u8 = vget_low_u8(q2u8);
+ d5u8 = vget_high_u8(q2u8);
+ d6u8 = vget_low_u8(q3u8);
+ d7u8 = vget_high_u8(q3u8);
+
+ q9u16 = vsubl_u8(d0u8, vget_low_u8(q5u8));
+ q10u16 = vsubl_u8(d1u8, vget_high_u8(q5u8));
+ q11u16 = vsubl_u8(d2u8, vget_low_u8(q6u8));
+ q12u16 = vsubl_u8(d3u8, vget_high_u8(q6u8));
+ q0u16 = vsubl_u8(d4u8, vget_low_u8(q7u8));
+ q1u16 = vsubl_u8(d5u8, vget_high_u8(q7u8));
+ q5u16 = vsubl_u8(d6u8, vget_low_u8(q8u8));
+ q6u16 = vsubl_u8(d7u8, vget_high_u8(q8u8));
+
+ d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+ d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+ q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q9u16));
+ q14s32 = vmlal_s16(q14s32, d18s16, d18s16);
+ q15s32 = vmlal_s16(q15s32, d19s16, d19s16);
+
+ d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+ d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
+ q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q10u16));
+ q14s32 = vmlal_s16(q14s32, d20s16, d20s16);
+ q15s32 = vmlal_s16(q15s32, d21s16, d21s16);
+
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q11u16));
+ q14s32 = vmlal_s16(q14s32, d22s16, d22s16);
+ q15s32 = vmlal_s16(q15s32, d23s16, d23s16);
+
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q12u16));
+ q14s32 = vmlal_s16(q14s32, d24s16, d24s16);
+ q15s32 = vmlal_s16(q15s32, d25s16, d25s16);
+
+ d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16));
+ d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16));
+ q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q0u16));
+ q14s32 = vmlal_s16(q14s32, d0s16, d0s16);
+ q15s32 = vmlal_s16(q15s32, d1s16, d1s16);
+
+ d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16));
+ d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16));
+ q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q1u16));
+ q14s32 = vmlal_s16(q14s32, d2s16, d2s16);
+ q15s32 = vmlal_s16(q15s32, d3s16, d3s16);
+
+ d10s16 = vreinterpret_s16_u16(vget_low_u16(q5u16));
+ d11s16 = vreinterpret_s16_u16(vget_high_u16(q5u16));
+ q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q5u16));
+ q14s32 = vmlal_s16(q14s32, d10s16, d10s16);
+ q15s32 = vmlal_s16(q15s32, d11s16, d11s16);
+
+ d12s16 = vreinterpret_s16_u16(vget_low_u16(q6u16));
+ d13s16 = vreinterpret_s16_u16(vget_high_u16(q6u16));
+ q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q6u16));
+ q14s32 = vmlal_s16(q14s32, d12s16, d12s16);
+ q15s32 = vmlal_s16(q15s32, d13s16, d13s16);
+
+ q0u8 = q4u8;
+ }
+
+ q15s32 = vaddq_s32(q14s32, q15s32);
+ q0s64 = vpaddlq_s32(q13s32);
+ q1s64 = vpaddlq_s32(q15s32);
+
+ d0s64 = vget_low_s64(q0s64);
+ d1s64 = vget_high_s64(q0s64);
+ d2s64 = vget_low_s64(q1s64);
+ d3s64 = vget_high_s64(q1s64);
+ d0s64 = vadd_s64(d0s64, d1s64);
+ d1s64 = vadd_s64(d2s64, d3s64);
+
+ q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
+ vreinterpret_s32_s64(d0s64));
+ vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+ d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
+ d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+ return vget_lane_u32(d0u32, 0);
+}
+
+#define FILTER_BITS 7
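+// The bilinear taps sum to 128 (1 << FILTER_BITS), so results are rounded
+// with (x + 64) >> FILTER_BITS when narrowing back to 8 bits.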
+
+static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
+ const int32x4_t a = vpaddlq_s16(v_16x8);
+ const int64x2_t b = vpaddlq_s32(a);
+ const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+ vreinterpret_s32_s64(vget_high_s64(b)));
+ return vget_lane_s32(c, 0);
+}
+
+static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
+ const int64x2_t b = vpaddlq_s32(v_32x4);
+ const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+ vreinterpret_s32_s64(vget_high_s64(b)));
+ return vget_lane_s32(c, 0);
+}
+
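+// Sum / SSE kernel over a w x h block (w a multiple of 8), 8 pixels at a
+// time; used by variance8x8_neon below.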
+static void variance_neon_w8(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ int w, int h, unsigned int *sse, int *sum) {
+ int i, j;
+ int16x8_t v_sum = vdupq_n_s16(0);
+ int32x4_t v_sse_lo = vdupq_n_s32(0);
+ int32x4_t v_sse_hi = vdupq_n_s32(0);
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const uint8x8_t v_a = vld1_u8(&a[j]);
+ const uint8x8_t v_b = vld1_u8(&b[j]);
+ const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
+ const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
+ v_sum = vaddq_s16(v_sum, sv_diff);
+ v_sse_lo = vmlal_s16(v_sse_lo,
+ vget_low_s16(sv_diff),
+ vget_low_s16(sv_diff));
+ v_sse_hi = vmlal_s16(v_sse_hi,
+ vget_high_s16(sv_diff),
+ vget_high_s16(sv_diff));
+ }
+ a += a_stride;
+ b += b_stride;
+ }
+
+ *sum = horizontal_add_s16x8(v_sum);
+ *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
+}
+
+static unsigned int variance8x8_neon(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
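+  // variance = SSE - sum^2 / 64 (64 = 8 * 8 pixels)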
+ return *sse - (((int64_t)sum * sum) / (8 * 8));
+}
+
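+// 2-tap bilinear prefilter over an 8-wide column strip.  pixel_step picks
+// the second tap's source: 1 pairs each pixel with its right neighbour
+// (horizontal pass), src_pixels_per_line pairs it with the pixel below
+// (vertical pass).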
+static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
+ uint8_t *output_ptr,
+ unsigned int src_pixels_per_line,
+ int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const uint16_t *vpx_filter) {
+ const uint8x8_t f0 = vmov_n_u8((uint8_t)vpx_filter[0]);
+ const uint8x8_t f1 = vmov_n_u8((uint8_t)vpx_filter[1]);
+ unsigned int i;
+ for (i = 0; i < output_height; ++i) {
+ const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
+ const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);
+ const uint16x8_t a = vmull_u8(src_0, f0);
+ const uint16x8_t b = vmlal_u8(a, src_1, f1);
+ const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
+ vst1_u8(&output_ptr[0], out);
+ // Next row...
+ src_ptr += src_pixels_per_line;
+ output_ptr += output_width;
+ }
+}
+
+unsigned int vp8_sub_pixel_variance8x8_neon(
+ const unsigned char *src,
+ int src_stride,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst,
+ int dst_stride,
+ unsigned int *sse) {
+ DECLARE_ALIGNED(16, uint8_t, temp2[9 * 8]);
+ DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);
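+  // Skip the horizontal pass when xoffset is 0 and the vertical pass when
+  // yoffset is 0; otherwise run a 9-row horizontal pass into fdata3 followed
+  // by an 8-row vertical pass into temp2.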
+  if (xoffset == 0) {
+    // Vertical-only filtering: the second tap is the pixel one row below, so
+    // the step must be the source stride rather than the 8-pixel block width.
+    var_filter_block2d_bil_w8(src, temp2, src_stride, src_stride, 8,
+                              8, bilinear_taps_coeff[yoffset]);
+ } else if (yoffset == 0) {
+ var_filter_block2d_bil_w8(src, temp2, src_stride, 1,
+ 9, 8,
+ bilinear_taps_coeff[xoffset]);
+ } else {
+ var_filter_block2d_bil_w8(src, fdata3, src_stride, 1,
+ 9, 8,
+ bilinear_taps_coeff[xoffset]);
+ var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8,
+ 8, bilinear_taps_coeff[yoffset]);
+ }
+ return variance8x8_neon(temp2, 8, dst, dst_stride, sse);
+}