summaryrefslogtreecommitdiffstats
path: root/media/libvpx/vp8/encoder/arm/neon
diff options
context:
space:
mode:
authorMatt A. Tobin <mattatobin@localhost.localdomain>2018-02-02 04:16:08 -0500
committerMatt A. Tobin <mattatobin@localhost.localdomain>2018-02-02 04:16:08 -0500
commit5f8de423f190bbb79a62f804151bc24824fa32d8 (patch)
tree10027f336435511475e392454359edea8e25895d /media/libvpx/vp8/encoder/arm/neon
parent49ee0794b5d912db1f95dce6eb52d781dc210db5 (diff)
downloadUXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar
UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.gz
UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.lz
UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.xz
UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.zip
Add m-esr52 at 52.6.0
Diffstat (limited to 'media/libvpx/vp8/encoder/arm/neon')
-rw-r--r--media/libvpx/vp8/encoder/arm/neon/denoising_neon.c478
-rw-r--r--media/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c89
-rw-r--r--media/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c269
-rw-r--r--media/libvpx/vp8/encoder/arm/neon/subtract_neon.c154
-rw-r--r--media/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c129
5 files changed, 1119 insertions, 0 deletions
diff --git a/media/libvpx/vp8/encoder/arm/neon/denoising_neon.c b/media/libvpx/vp8/encoder/arm/neon/denoising_neon.c
new file mode 100644
index 000000000..08be76e43
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/neon/denoising_neon.c
@@ -0,0 +1,478 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "vp8/encoder/denoising.h"
+#include "vpx_mem/vpx_mem.h"
+#include "./vp8_rtcd.h"
+
+/*
+ * The filter function was modified to reduce the computational complexity.
+ *
+ * Step 1:
+ * Instead of applying tap coefficients for each pixel, we calculated the
+ * pixel adjustments vs. pixel diff value ahead of time.
+ * adjustment = filtered_value - current_raw
+ * = (filter_coefficient * diff + 128) >> 8
+ * where
+ * filter_coefficient = (255 << 8) / (256 + ((abs_diff * 330) >> 3));
+ * filter_coefficient += filter_coefficient /
+ * (3 + motion_magnitude_adjustment);
+ * filter_coefficient is clamped to 0 ~ 255.
+ *
+ * Step 2:
+ * The adjustment vs. diff curve becomes flat very quick when diff increases.
+ * This allowed us to use only several levels to approximate the curve without
+ * changing the filtering algorithm too much.
+ * The adjustments were further corrected by checking the motion magnitude.
+ * The levels used are:
+ * diff level adjustment w/o adjustment w/
+ * motion correction motion correction
+ * [-255, -16] 3 -6 -7
+ * [-15, -8] 2 -4 -5
+ * [-7, -4] 1 -3 -4
+ * [-3, 3] 0 diff diff
+ * [4, 7] 1 3 4
+ * [8, 15] 2 4 5
+ * [16, 255] 3 6 7
+ */
+
+int vp8_denoiser_filter_neon(unsigned char *mc_running_avg_y,
+ int mc_running_avg_y_stride,
+ unsigned char *running_avg_y,
+ int running_avg_y_stride,
+ unsigned char *sig, int sig_stride,
+ unsigned int motion_magnitude,
+ int increase_denoising) {
+ /* If motion_magnitude is small, making the denoiser more aggressive by
+ * increasing the adjustment for each level, level1 adjustment is
+ * increased, the deltas stay the same.
+ */
+ int shift_inc = (increase_denoising &&
+ motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0;
+ const uint8x16_t v_level1_adjustment = vmovq_n_u8(
+ (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3);
+ const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1);
+ const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2);
+ const uint8x16_t v_level1_threshold = vmovq_n_u8(4 + shift_inc);
+ const uint8x16_t v_level2_threshold = vdupq_n_u8(8);
+ const uint8x16_t v_level3_threshold = vdupq_n_u8(16);
+ int64x2_t v_sum_diff_total = vdupq_n_s64(0);
+
+ /* Go over lines. */
+ int r;
+ for (r = 0; r < 16; ++r) {
+ /* Load inputs. */
+ const uint8x16_t v_sig = vld1q_u8(sig);
+ const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);
+
+ /* Calculate absolute difference and sign masks. */
+ const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y);
+ const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y);
+ const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y);
+
+ /* Figure out which level that put us in. */
+ const uint8x16_t v_level1_mask = vcleq_u8(v_level1_threshold,
+ v_abs_diff);
+ const uint8x16_t v_level2_mask = vcleq_u8(v_level2_threshold,
+ v_abs_diff);
+ const uint8x16_t v_level3_mask = vcleq_u8(v_level3_threshold,
+ v_abs_diff);
+
+ /* Calculate absolute adjustments for level 1, 2 and 3. */
+ const uint8x16_t v_level2_adjustment = vandq_u8(v_level2_mask,
+ v_delta_level_1_and_2);
+ const uint8x16_t v_level3_adjustment = vandq_u8(v_level3_mask,
+ v_delta_level_2_and_3);
+ const uint8x16_t v_level1and2_adjustment = vaddq_u8(v_level1_adjustment,
+ v_level2_adjustment);
+ const uint8x16_t v_level1and2and3_adjustment = vaddq_u8(
+ v_level1and2_adjustment, v_level3_adjustment);
+
+ /* Figure adjustment absolute value by selecting between the absolute
+ * difference if in level0 or the value for level 1, 2 and 3.
+ */
+ const uint8x16_t v_abs_adjustment = vbslq_u8(v_level1_mask,
+ v_level1and2and3_adjustment, v_abs_diff);
+
+ /* Calculate positive and negative adjustments. Apply them to the signal
+ * and accumulate them. Adjustments are less than eight and the maximum
+ * sum of them (7 * 16) can fit in a signed char.
+ */
+ const uint8x16_t v_pos_adjustment = vandq_u8(v_diff_pos_mask,
+ v_abs_adjustment);
+ const uint8x16_t v_neg_adjustment = vandq_u8(v_diff_neg_mask,
+ v_abs_adjustment);
+
+ uint8x16_t v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment);
+ v_running_avg_y = vqsubq_u8(v_running_avg_y, v_neg_adjustment);
+
+ /* Store results. */
+ vst1q_u8(running_avg_y, v_running_avg_y);
+
+ /* Sum all the accumulators to have the sum of all pixel differences
+ * for this macroblock.
+ */
+ {
+ const int8x16_t v_sum_diff =
+ vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment),
+ vreinterpretq_s8_u8(v_neg_adjustment));
+
+ const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff);
+
+ const int32x4_t fedc_ba98_7654_3210 =
+ vpaddlq_s16(fe_dc_ba_98_76_54_32_10);
+
+ const int64x2_t fedcba98_76543210 =
+ vpaddlq_s32(fedc_ba98_7654_3210);
+
+ v_sum_diff_total = vqaddq_s64(v_sum_diff_total, fedcba98_76543210);
+ }
+
+ /* Update pointers for next iteration. */
+ sig += sig_stride;
+ mc_running_avg_y += mc_running_avg_y_stride;
+ running_avg_y += running_avg_y_stride;
+ }
+
+ /* Too much adjustments => copy block. */
+ {
+ int64x1_t x = vqadd_s64(vget_high_s64(v_sum_diff_total),
+ vget_low_s64(v_sum_diff_total));
+ int sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0);
+ int sum_diff_thresh = SUM_DIFF_THRESHOLD;
+
+ if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
+ if (sum_diff > sum_diff_thresh) {
+ // Before returning to copy the block (i.e., apply no denoising),
+ // checK if we can still apply some (weaker) temporal filtering to
+ // this block, that would otherwise not be denoised at all. Simplest
+ // is to apply an additional adjustment to running_avg_y to bring it
+ // closer to sig. The adjustment is capped by a maximum delta, and
+ // chosen such that in most cases the resulting sum_diff will be
+ // within the accceptable range given by sum_diff_thresh.
+
+ // The delta is set by the excess of absolute pixel diff over the
+ // threshold.
+ int delta = ((sum_diff - sum_diff_thresh) >> 8) + 1;
+ // Only apply the adjustment for max delta up to 3.
+ if (delta < 4) {
+ const uint8x16_t k_delta = vmovq_n_u8(delta);
+ sig -= sig_stride * 16;
+ mc_running_avg_y -= mc_running_avg_y_stride * 16;
+ running_avg_y -= running_avg_y_stride * 16;
+ for (r = 0; r < 16; ++r) {
+ uint8x16_t v_running_avg_y = vld1q_u8(running_avg_y);
+ const uint8x16_t v_sig = vld1q_u8(sig);
+ const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);
+
+ /* Calculate absolute difference and sign masks. */
+ const uint8x16_t v_abs_diff = vabdq_u8(v_sig,
+ v_mc_running_avg_y);
+ const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig,
+ v_mc_running_avg_y);
+ const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig,
+ v_mc_running_avg_y);
+ // Clamp absolute difference to delta to get the adjustment.
+ const uint8x16_t v_abs_adjustment =
+ vminq_u8(v_abs_diff, (k_delta));
+
+ const uint8x16_t v_pos_adjustment = vandq_u8(v_diff_pos_mask,
+ v_abs_adjustment);
+ const uint8x16_t v_neg_adjustment = vandq_u8(v_diff_neg_mask,
+ v_abs_adjustment);
+
+ v_running_avg_y = vqsubq_u8(v_running_avg_y, v_pos_adjustment);
+ v_running_avg_y = vqaddq_u8(v_running_avg_y, v_neg_adjustment);
+
+ /* Store results. */
+ vst1q_u8(running_avg_y, v_running_avg_y);
+
+ {
+ const int8x16_t v_sum_diff =
+ vqsubq_s8(vreinterpretq_s8_u8(v_neg_adjustment),
+ vreinterpretq_s8_u8(v_pos_adjustment));
+
+ const int16x8_t fe_dc_ba_98_76_54_32_10 =
+ vpaddlq_s8(v_sum_diff);
+ const int32x4_t fedc_ba98_7654_3210 =
+ vpaddlq_s16(fe_dc_ba_98_76_54_32_10);
+ const int64x2_t fedcba98_76543210 =
+ vpaddlq_s32(fedc_ba98_7654_3210);
+
+ v_sum_diff_total = vqaddq_s64(v_sum_diff_total,
+ fedcba98_76543210);
+ }
+ /* Update pointers for next iteration. */
+ sig += sig_stride;
+ mc_running_avg_y += mc_running_avg_y_stride;
+ running_avg_y += running_avg_y_stride;
+ }
+ {
+ // Update the sum of all pixel differences of this MB.
+ x = vqadd_s64(vget_high_s64(v_sum_diff_total),
+ vget_low_s64(v_sum_diff_total));
+ sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0);
+
+ if (sum_diff > sum_diff_thresh) {
+ return COPY_BLOCK;
+ }
+ }
+ } else {
+ return COPY_BLOCK;
+ }
+ }
+ }
+
+ /* Tell above level that block was filtered. */
+ running_avg_y -= running_avg_y_stride * 16;
+ sig -= sig_stride * 16;
+
+ vp8_copy_mem16x16(running_avg_y, running_avg_y_stride, sig, sig_stride);
+
+ return FILTER_BLOCK;
+}
+
+int vp8_denoiser_filter_uv_neon(unsigned char *mc_running_avg,
+ int mc_running_avg_stride,
+ unsigned char *running_avg,
+ int running_avg_stride,
+ unsigned char *sig, int sig_stride,
+ unsigned int motion_magnitude,
+ int increase_denoising) {
+ /* If motion_magnitude is small, making the denoiser more aggressive by
+ * increasing the adjustment for each level, level1 adjustment is
+ * increased, the deltas stay the same.
+ */
+ int shift_inc = (increase_denoising &&
+ motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ? 1 : 0;
+ const uint8x16_t v_level1_adjustment = vmovq_n_u8(
+ (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ? 4 + shift_inc : 3);
+
+ const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1);
+ const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2);
+ const uint8x16_t v_level1_threshold = vmovq_n_u8(4 + shift_inc);
+ const uint8x16_t v_level2_threshold = vdupq_n_u8(8);
+ const uint8x16_t v_level3_threshold = vdupq_n_u8(16);
+ int64x2_t v_sum_diff_total = vdupq_n_s64(0);
+ int r;
+
+ {
+ uint16x4_t v_sum_block = vdup_n_u16(0);
+
+ // Avoid denoising color signal if its close to average level.
+ for (r = 0; r < 8; ++r) {
+ const uint8x8_t v_sig = vld1_u8(sig);
+ const uint16x4_t _76_54_32_10 = vpaddl_u8(v_sig);
+ v_sum_block = vqadd_u16(v_sum_block, _76_54_32_10);
+ sig += sig_stride;
+ }
+ sig -= sig_stride * 8;
+ {
+ const uint32x2_t _7654_3210 = vpaddl_u16(v_sum_block);
+ const uint64x1_t _76543210 = vpaddl_u32(_7654_3210);
+ const int sum_block =
+ vget_lane_s32(vreinterpret_s32_u64(_76543210), 0);
+ if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) {
+ return COPY_BLOCK;
+ }
+ }
+ }
+
+ /* Go over lines. */
+ for (r = 0; r < 4; ++r) {
+ /* Load inputs. */
+ const uint8x8_t v_sig_lo = vld1_u8(sig);
+ const uint8x8_t v_sig_hi = vld1_u8(&sig[sig_stride]);
+ const uint8x16_t v_sig = vcombine_u8(v_sig_lo, v_sig_hi);
+ const uint8x8_t v_mc_running_avg_lo = vld1_u8(mc_running_avg);
+ const uint8x8_t v_mc_running_avg_hi =
+ vld1_u8(&mc_running_avg[mc_running_avg_stride]);
+ const uint8x16_t v_mc_running_avg =
+ vcombine_u8(v_mc_running_avg_lo, v_mc_running_avg_hi);
+ /* Calculate absolute difference and sign masks. */
+ const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg);
+ const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg);
+ const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg);
+
+ /* Figure out which level that put us in. */
+ const uint8x16_t v_level1_mask = vcleq_u8(v_level1_threshold,
+ v_abs_diff);
+ const uint8x16_t v_level2_mask = vcleq_u8(v_level2_threshold,
+ v_abs_diff);
+ const uint8x16_t v_level3_mask = vcleq_u8(v_level3_threshold,
+ v_abs_diff);
+
+ /* Calculate absolute adjustments for level 1, 2 and 3. */
+ const uint8x16_t v_level2_adjustment = vandq_u8(v_level2_mask,
+ v_delta_level_1_and_2);
+ const uint8x16_t v_level3_adjustment = vandq_u8(v_level3_mask,
+ v_delta_level_2_and_3);
+ const uint8x16_t v_level1and2_adjustment = vaddq_u8(v_level1_adjustment,
+ v_level2_adjustment);
+ const uint8x16_t v_level1and2and3_adjustment = vaddq_u8(
+ v_level1and2_adjustment, v_level3_adjustment);
+
+ /* Figure adjustment absolute value by selecting between the absolute
+ * difference if in level0 or the value for level 1, 2 and 3.
+ */
+ const uint8x16_t v_abs_adjustment = vbslq_u8(v_level1_mask,
+ v_level1and2and3_adjustment, v_abs_diff);
+
+ /* Calculate positive and negative adjustments. Apply them to the signal
+ * and accumulate them. Adjustments are less than eight and the maximum
+ * sum of them (7 * 16) can fit in a signed char.
+ */
+ const uint8x16_t v_pos_adjustment = vandq_u8(v_diff_pos_mask,
+ v_abs_adjustment);
+ const uint8x16_t v_neg_adjustment = vandq_u8(v_diff_neg_mask,
+ v_abs_adjustment);
+
+ uint8x16_t v_running_avg = vqaddq_u8(v_sig, v_pos_adjustment);
+ v_running_avg = vqsubq_u8(v_running_avg, v_neg_adjustment);
+
+ /* Store results. */
+ vst1_u8(running_avg, vget_low_u8(v_running_avg));
+ vst1_u8(&running_avg[running_avg_stride], vget_high_u8(v_running_avg));
+
+ /* Sum all the accumulators to have the sum of all pixel differences
+ * for this macroblock.
+ */
+ {
+ const int8x16_t v_sum_diff =
+ vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment),
+ vreinterpretq_s8_u8(v_neg_adjustment));
+
+ const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff);
+
+ const int32x4_t fedc_ba98_7654_3210 =
+ vpaddlq_s16(fe_dc_ba_98_76_54_32_10);
+
+ const int64x2_t fedcba98_76543210 =
+ vpaddlq_s32(fedc_ba98_7654_3210);
+
+ v_sum_diff_total = vqaddq_s64(v_sum_diff_total, fedcba98_76543210);
+ }
+
+ /* Update pointers for next iteration. */
+ sig += sig_stride * 2;
+ mc_running_avg += mc_running_avg_stride * 2;
+ running_avg += running_avg_stride * 2;
+ }
+
+
+ /* Too much adjustments => copy block. */
+ {
+ int64x1_t x = vqadd_s64(vget_high_s64(v_sum_diff_total),
+ vget_low_s64(v_sum_diff_total));
+ int sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0);
+ int sum_diff_thresh = SUM_DIFF_THRESHOLD_UV;
+ if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV;
+ if (sum_diff > sum_diff_thresh) {
+ // Before returning to copy the block (i.e., apply no denoising),
+ // checK if we can still apply some (weaker) temporal filtering to
+ // this block, that would otherwise not be denoised at all. Simplest
+ // is to apply an additional adjustment to running_avg_y to bring it
+ // closer to sig. The adjustment is capped by a maximum delta, and
+ // chosen such that in most cases the resulting sum_diff will be
+ // within the accceptable range given by sum_diff_thresh.
+
+ // The delta is set by the excess of absolute pixel diff over the
+ // threshold.
+ int delta = ((sum_diff - sum_diff_thresh) >> 8) + 1;
+ // Only apply the adjustment for max delta up to 3.
+ if (delta < 4) {
+ const uint8x16_t k_delta = vmovq_n_u8(delta);
+ sig -= sig_stride * 8;
+ mc_running_avg -= mc_running_avg_stride * 8;
+ running_avg -= running_avg_stride * 8;
+ for (r = 0; r < 4; ++r) {
+ const uint8x8_t v_sig_lo = vld1_u8(sig);
+ const uint8x8_t v_sig_hi = vld1_u8(&sig[sig_stride]);
+ const uint8x16_t v_sig = vcombine_u8(v_sig_lo, v_sig_hi);
+ const uint8x8_t v_mc_running_avg_lo = vld1_u8(mc_running_avg);
+ const uint8x8_t v_mc_running_avg_hi =
+ vld1_u8(&mc_running_avg[mc_running_avg_stride]);
+ const uint8x16_t v_mc_running_avg =
+ vcombine_u8(v_mc_running_avg_lo, v_mc_running_avg_hi);
+ /* Calculate absolute difference and sign masks. */
+ const uint8x16_t v_abs_diff = vabdq_u8(v_sig,
+ v_mc_running_avg);
+ const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig,
+ v_mc_running_avg);
+ const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig,
+ v_mc_running_avg);
+ // Clamp absolute difference to delta to get the adjustment.
+ const uint8x16_t v_abs_adjustment =
+ vminq_u8(v_abs_diff, (k_delta));
+
+ const uint8x16_t v_pos_adjustment = vandq_u8(v_diff_pos_mask,
+ v_abs_adjustment);
+ const uint8x16_t v_neg_adjustment = vandq_u8(v_diff_neg_mask,
+ v_abs_adjustment);
+ const uint8x8_t v_running_avg_lo = vld1_u8(running_avg);
+ const uint8x8_t v_running_avg_hi =
+ vld1_u8(&running_avg[running_avg_stride]);
+ uint8x16_t v_running_avg =
+ vcombine_u8(v_running_avg_lo, v_running_avg_hi);
+
+ v_running_avg = vqsubq_u8(v_running_avg, v_pos_adjustment);
+ v_running_avg = vqaddq_u8(v_running_avg, v_neg_adjustment);
+
+ /* Store results. */
+ vst1_u8(running_avg, vget_low_u8(v_running_avg));
+ vst1_u8(&running_avg[running_avg_stride],
+ vget_high_u8(v_running_avg));
+
+ {
+ const int8x16_t v_sum_diff =
+ vqsubq_s8(vreinterpretq_s8_u8(v_neg_adjustment),
+ vreinterpretq_s8_u8(v_pos_adjustment));
+
+ const int16x8_t fe_dc_ba_98_76_54_32_10 =
+ vpaddlq_s8(v_sum_diff);
+ const int32x4_t fedc_ba98_7654_3210 =
+ vpaddlq_s16(fe_dc_ba_98_76_54_32_10);
+ const int64x2_t fedcba98_76543210 =
+ vpaddlq_s32(fedc_ba98_7654_3210);
+
+ v_sum_diff_total = vqaddq_s64(v_sum_diff_total,
+ fedcba98_76543210);
+ }
+ /* Update pointers for next iteration. */
+ sig += sig_stride * 2;
+ mc_running_avg += mc_running_avg_stride * 2;
+ running_avg += running_avg_stride * 2;
+ }
+ {
+ // Update the sum of all pixel differences of this MB.
+ x = vqadd_s64(vget_high_s64(v_sum_diff_total),
+ vget_low_s64(v_sum_diff_total));
+ sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0);
+
+ if (sum_diff > sum_diff_thresh) {
+ return COPY_BLOCK;
+ }
+ }
+ } else {
+ return COPY_BLOCK;
+ }
+ }
+ }
+
+ /* Tell above level that block was filtered. */
+ running_avg -= running_avg_stride * 8;
+ sig -= sig_stride * 8;
+
+ vp8_copy_mem8x8(running_avg, running_avg_stride, sig, sig_stride);
+
+ return FILTER_BLOCK;
+}
diff --git a/media/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c b/media/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c
new file mode 100644
index 000000000..e5824bfb2
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "vp8/encoder/block.h"
+
+static const uint16_t inv_zig_zag[16] = {
+ 1, 2, 6, 7,
+ 3, 5, 8, 13,
+ 4, 9, 12, 14,
+ 10, 11, 15, 16
+};
+
+void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
+ const int16x8_t one_q = vdupq_n_s16(-1),
+ z0 = vld1q_s16(b->coeff),
+ z1 = vld1q_s16(b->coeff + 8),
+ round0 = vld1q_s16(b->round),
+ round1 = vld1q_s16(b->round + 8),
+ quant0 = vld1q_s16(b->quant_fast),
+ quant1 = vld1q_s16(b->quant_fast + 8),
+ dequant0 = vld1q_s16(d->dequant),
+ dequant1 = vld1q_s16(d->dequant + 8);
+ const uint16x8_t zig_zag0 = vld1q_u16(inv_zig_zag),
+ zig_zag1 = vld1q_u16(inv_zig_zag + 8);
+ int16x8_t x0, x1, sz0, sz1, y0, y1;
+ uint16x8_t eob0, eob1;
+ uint16x4_t eob_d16;
+ uint32x2_t eob_d32;
+ uint32x4_t eob_q32;
+
+ /* sign of z: z >> 15 */
+ sz0 = vshrq_n_s16(z0, 15);
+ sz1 = vshrq_n_s16(z1, 15);
+
+ /* x = abs(z) */
+ x0 = vabsq_s16(z0);
+ x1 = vabsq_s16(z1);
+
+ /* x += round */
+ x0 = vaddq_s16(x0, round0);
+ x1 = vaddq_s16(x1, round1);
+
+ /* y = 2 * (x * quant) >> 16 */
+ y0 = vqdmulhq_s16(x0, quant0);
+ y1 = vqdmulhq_s16(x1, quant1);
+
+ /* Compensate for doubling in vqdmulhq */
+ y0 = vshrq_n_s16(y0, 1);
+ y1 = vshrq_n_s16(y1, 1);
+
+ /* Restore sign bit */
+ y0 = veorq_s16(y0, sz0);
+ y1 = veorq_s16(y1, sz1);
+ x0 = vsubq_s16(y0, sz0);
+ x1 = vsubq_s16(y1, sz1);
+
+ /* find non-zero elements */
+ eob0 = vtstq_s16(x0, one_q);
+ eob1 = vtstq_s16(x1, one_q);
+
+ /* mask zig zag */
+ eob0 = vandq_u16(eob0, zig_zag0);
+ eob1 = vandq_u16(eob1, zig_zag1);
+
+ /* select the largest value */
+ eob0 = vmaxq_u16(eob0, eob1);
+ eob_d16 = vmax_u16(vget_low_u16(eob0), vget_high_u16(eob0));
+ eob_q32 = vmovl_u16(eob_d16);
+ eob_d32 = vmax_u32(vget_low_u32(eob_q32), vget_high_u32(eob_q32));
+ eob_d32 = vpmax_u32(eob_d32, eob_d32);
+
+ /* qcoeff = x */
+ vst1q_s16(d->qcoeff, x0);
+ vst1q_s16(d->qcoeff + 8, x1);
+
+ /* dqcoeff = x * dequant */
+ vst1q_s16(d->dqcoeff, vmulq_s16(dequant0, x0));
+ vst1q_s16(d->dqcoeff + 8, vmulq_s16(dequant1, x1));
+
+ vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0);
+}
diff --git a/media/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c b/media/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c
new file mode 100644
index 000000000..391e5f990
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void vp8_short_fdct4x4_neon(
+ int16_t *input,
+ int16_t *output,
+ int pitch) {
+ int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+ int16x4_t d16s16, d17s16, d26s16, dEmptys16;
+ uint16x4_t d4u16;
+ int16x8_t q0s16, q1s16;
+ int32x4_t q9s32, q10s32, q11s32, q12s32;
+ int16x4x2_t v2tmp0, v2tmp1;
+ int32x2x2_t v2tmp2, v2tmp3;
+
+ d16s16 = vdup_n_s16(5352);
+ d17s16 = vdup_n_s16(2217);
+ q9s32 = vdupq_n_s32(14500);
+ q10s32 = vdupq_n_s32(7500);
+ q11s32 = vdupq_n_s32(12000);
+ q12s32 = vdupq_n_s32(51000);
+
+ // Part one
+ pitch >>= 1;
+ d0s16 = vld1_s16(input);
+ input += pitch;
+ d1s16 = vld1_s16(input);
+ input += pitch;
+ d2s16 = vld1_s16(input);
+ input += pitch;
+ d3s16 = vld1_s16(input);
+
+ v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16),
+ vreinterpret_s32_s16(d2s16));
+ v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16),
+ vreinterpret_s32_s16(d3s16));
+ v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]), // d0
+ vreinterpret_s16_s32(v2tmp3.val[0])); // d1
+ v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]), // d2
+ vreinterpret_s16_s32(v2tmp3.val[1])); // d3
+
+ d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
+ d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
+ d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
+ d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
+
+ d4s16 = vshl_n_s16(d4s16, 3);
+ d5s16 = vshl_n_s16(d5s16, 3);
+ d6s16 = vshl_n_s16(d6s16, 3);
+ d7s16 = vshl_n_s16(d7s16, 3);
+
+ d0s16 = vadd_s16(d4s16, d5s16);
+ d2s16 = vsub_s16(d4s16, d5s16);
+
+ q9s32 = vmlal_s16(q9s32, d7s16, d16s16);
+ q10s32 = vmlal_s16(q10s32, d7s16, d17s16);
+ q9s32 = vmlal_s16(q9s32, d6s16, d17s16);
+ q10s32 = vmlsl_s16(q10s32, d6s16, d16s16);
+
+ d1s16 = vshrn_n_s32(q9s32, 12);
+ d3s16 = vshrn_n_s32(q10s32, 12);
+
+ // Part two
+ v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16),
+ vreinterpret_s32_s16(d2s16));
+ v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16),
+ vreinterpret_s32_s16(d3s16));
+ v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]), // d0
+ vreinterpret_s16_s32(v2tmp3.val[0])); // d1
+ v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]), // d2
+ vreinterpret_s16_s32(v2tmp3.val[1])); // d3
+
+ d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
+ d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
+ d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
+ d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
+
+ d26s16 = vdup_n_s16(7);
+ d4s16 = vadd_s16(d4s16, d26s16);
+
+ d0s16 = vadd_s16(d4s16, d5s16);
+ d2s16 = vsub_s16(d4s16, d5s16);
+
+ q11s32 = vmlal_s16(q11s32, d7s16, d16s16);
+ q12s32 = vmlal_s16(q12s32, d7s16, d17s16);
+
+ dEmptys16 = vdup_n_s16(0);
+ d4u16 = vceq_s16(d7s16, dEmptys16);
+
+ d0s16 = vshr_n_s16(d0s16, 4);
+ d2s16 = vshr_n_s16(d2s16, 4);
+
+ q11s32 = vmlal_s16(q11s32, d6s16, d17s16);
+ q12s32 = vmlsl_s16(q12s32, d6s16, d16s16);
+
+ d4u16 = vmvn_u16(d4u16);
+ d1s16 = vshrn_n_s32(q11s32, 16);
+ d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d4u16));
+ d3s16 = vshrn_n_s32(q12s32, 16);
+
+ q0s16 = vcombine_s16(d0s16, d1s16);
+ q1s16 = vcombine_s16(d2s16, d3s16);
+
+ vst1q_s16(output, q0s16);
+ vst1q_s16(output + 8, q1s16);
+ return;
+}
+
+void vp8_short_fdct8x4_neon(
+ int16_t *input,
+ int16_t *output,
+ int pitch) {
+ int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+ int16x4_t d16s16, d17s16, d26s16, d27s16, d28s16, d29s16;
+ uint16x4_t d28u16, d29u16;
+ uint16x8_t q14u16;
+ int16x8_t q0s16, q1s16, q2s16, q3s16;
+ int16x8_t q11s16, q12s16, q13s16, q14s16, q15s16, qEmptys16;
+ int32x4_t q9s32, q10s32, q11s32, q12s32;
+ int16x8x2_t v2tmp0, v2tmp1;
+ int32x4x2_t v2tmp2, v2tmp3;
+
+ d16s16 = vdup_n_s16(5352);
+ d17s16 = vdup_n_s16(2217);
+ q9s32 = vdupq_n_s32(14500);
+ q10s32 = vdupq_n_s32(7500);
+
+ // Part one
+ pitch >>= 1;
+ q0s16 = vld1q_s16(input);
+ input += pitch;
+ q1s16 = vld1q_s16(input);
+ input += pitch;
+ q2s16 = vld1q_s16(input);
+ input += pitch;
+ q3s16 = vld1q_s16(input);
+
+ v2tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q0s16),
+ vreinterpretq_s32_s16(q2s16));
+ v2tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q1s16),
+ vreinterpretq_s32_s16(q3s16));
+ v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]), // q0
+ vreinterpretq_s16_s32(v2tmp3.val[0])); // q1
+ v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]), // q2
+ vreinterpretq_s16_s32(v2tmp3.val[1])); // q3
+
+ q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]);
+ q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]);
+ q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]);
+ q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]);
+
+ q11s16 = vshlq_n_s16(q11s16, 3);
+ q12s16 = vshlq_n_s16(q12s16, 3);
+ q13s16 = vshlq_n_s16(q13s16, 3);
+ q14s16 = vshlq_n_s16(q14s16, 3);
+
+ q0s16 = vaddq_s16(q11s16, q12s16);
+ q2s16 = vsubq_s16(q11s16, q12s16);
+
+ q11s32 = q9s32;
+ q12s32 = q10s32;
+
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+
+ q9s32 = vmlal_s16(q9s32, d28s16, d16s16);
+ q10s32 = vmlal_s16(q10s32, d28s16, d17s16);
+ q11s32 = vmlal_s16(q11s32, d29s16, d16s16);
+ q12s32 = vmlal_s16(q12s32, d29s16, d17s16);
+
+ q9s32 = vmlal_s16(q9s32, d26s16, d17s16);
+ q10s32 = vmlsl_s16(q10s32, d26s16, d16s16);
+ q11s32 = vmlal_s16(q11s32, d27s16, d17s16);
+ q12s32 = vmlsl_s16(q12s32, d27s16, d16s16);
+
+ d2s16 = vshrn_n_s32(q9s32, 12);
+ d6s16 = vshrn_n_s32(q10s32, 12);
+ d3s16 = vshrn_n_s32(q11s32, 12);
+ d7s16 = vshrn_n_s32(q12s32, 12);
+ q1s16 = vcombine_s16(d2s16, d3s16);
+ q3s16 = vcombine_s16(d6s16, d7s16);
+
+ // Part two
+ q9s32 = vdupq_n_s32(12000);
+ q10s32 = vdupq_n_s32(51000);
+
+ v2tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q0s16),
+ vreinterpretq_s32_s16(q2s16));
+ v2tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q1s16),
+ vreinterpretq_s32_s16(q3s16));
+ v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]), // q0
+ vreinterpretq_s16_s32(v2tmp3.val[0])); // q1
+ v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]), // q2
+ vreinterpretq_s16_s32(v2tmp3.val[1])); // q3
+
+ q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]);
+ q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]);
+ q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]);
+ q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]);
+
+ q15s16 = vdupq_n_s16(7);
+ q11s16 = vaddq_s16(q11s16, q15s16);
+ q0s16 = vaddq_s16(q11s16, q12s16);
+ q1s16 = vsubq_s16(q11s16, q12s16);
+
+ q11s32 = q9s32;
+ q12s32 = q10s32;
+
+ d0s16 = vget_low_s16(q0s16);
+ d1s16 = vget_high_s16(q0s16);
+ d2s16 = vget_low_s16(q1s16);
+ d3s16 = vget_high_s16(q1s16);
+
+ d0s16 = vshr_n_s16(d0s16, 4);
+ d4s16 = vshr_n_s16(d1s16, 4);
+ d2s16 = vshr_n_s16(d2s16, 4);
+ d6s16 = vshr_n_s16(d3s16, 4);
+
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+
+ q9s32 = vmlal_s16(q9s32, d28s16, d16s16);
+ q10s32 = vmlal_s16(q10s32, d28s16, d17s16);
+ q11s32 = vmlal_s16(q11s32, d29s16, d16s16);
+ q12s32 = vmlal_s16(q12s32, d29s16, d17s16);
+
+ q9s32 = vmlal_s16(q9s32, d26s16, d17s16);
+ q10s32 = vmlsl_s16(q10s32, d26s16, d16s16);
+ q11s32 = vmlal_s16(q11s32, d27s16, d17s16);
+ q12s32 = vmlsl_s16(q12s32, d27s16, d16s16);
+
+ d1s16 = vshrn_n_s32(q9s32, 16);
+ d3s16 = vshrn_n_s32(q10s32, 16);
+ d5s16 = vshrn_n_s32(q11s32, 16);
+ d7s16 = vshrn_n_s32(q12s32, 16);
+
+ qEmptys16 = vdupq_n_s16(0);
+ q14u16 = vceqq_s16(q14s16, qEmptys16);
+ q14u16 = vmvnq_u16(q14u16);
+
+ d28u16 = vget_low_u16(q14u16);
+ d29u16 = vget_high_u16(q14u16);
+ d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d28u16));
+ d5s16 = vsub_s16(d5s16, vreinterpret_s16_u16(d29u16));
+
+ q0s16 = vcombine_s16(d0s16, d1s16);
+ q1s16 = vcombine_s16(d2s16, d3s16);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q3s16 = vcombine_s16(d6s16, d7s16);
+
+ vst1q_s16(output, q0s16);
+ vst1q_s16(output + 8, q1s16);
+ vst1q_s16(output + 16, q2s16);
+ vst1q_s16(output + 24, q3s16);
+ return;
+}
diff --git a/media/libvpx/vp8/encoder/arm/neon/subtract_neon.c b/media/libvpx/vp8/encoder/arm/neon/subtract_neon.c
new file mode 100644
index 000000000..d3ab7b165
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/neon/subtract_neon.c
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "vp8/encoder/block.h"
+
+void vp8_subtract_b_neon(
+ BLOCK *be,
+ BLOCKD *bd,
+ int pitch) {
+ unsigned char *src_ptr, *predictor;
+ int src_stride;
+ int16_t *src_diff;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+ uint16x8_t q10u16, q11u16, q12u16, q13u16;
+
+ src_ptr = *be->base_src + be->src;
+ src_stride = be->src_stride;
+ predictor = bd->predictor;
+
+ d0u8 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d2u8 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d4u8 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d6u8 = vld1_u8(src_ptr);
+
+ d1u8 = vld1_u8(predictor);
+ predictor += pitch;
+ d3u8 = vld1_u8(predictor);
+ predictor += pitch;
+ d5u8 = vld1_u8(predictor);
+ predictor += pitch;
+ d7u8 = vld1_u8(predictor);
+
+ q10u16 = vsubl_u8(d0u8, d1u8);
+ q11u16 = vsubl_u8(d2u8, d3u8);
+ q12u16 = vsubl_u8(d4u8, d5u8);
+ q13u16 = vsubl_u8(d6u8, d7u8);
+
+ src_diff = be->src_diff;
+ vst1_u16((uint16_t *)src_diff, vget_low_u16(q10u16));
+ src_diff += pitch;
+ vst1_u16((uint16_t *)src_diff, vget_low_u16(q11u16));
+ src_diff += pitch;
+ vst1_u16((uint16_t *)src_diff, vget_low_u16(q12u16));
+ src_diff += pitch;
+ vst1_u16((uint16_t *)src_diff, vget_low_u16(q13u16));
+ return;
+}
+
+void vp8_subtract_mby_neon(
+ int16_t *diff,
+ unsigned char *src,
+ int src_stride,
+ unsigned char *pred,
+ int pred_stride) {
+ int i;
+ uint8x16_t q0u8, q1u8, q2u8, q3u8;
+ uint16x8_t q8u16, q9u16, q10u16, q11u16;
+
+ for (i = 0; i < 8; i++) { // subtract_mby_loop
+ q0u8 = vld1q_u8(src);
+ src += src_stride;
+ q2u8 = vld1q_u8(src);
+ src += src_stride;
+ q1u8 = vld1q_u8(pred);
+ pred += pred_stride;
+ q3u8 = vld1q_u8(pred);
+ pred += pred_stride;
+
+ q8u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q1u8));
+ q9u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q1u8));
+ q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q3u8));
+ q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q3u8));
+
+ vst1q_u16((uint16_t *)diff, q8u16);
+ diff += 8;
+ vst1q_u16((uint16_t *)diff, q9u16);
+ diff += 8;
+ vst1q_u16((uint16_t *)diff, q10u16);
+ diff += 8;
+ vst1q_u16((uint16_t *)diff, q11u16);
+ diff += 8;
+ }
+ return;
+}
+
+void vp8_subtract_mbuv_neon(
+ int16_t *diff,
+ unsigned char *usrc,
+ unsigned char *vsrc,
+ int src_stride,
+ unsigned char *upred,
+ unsigned char *vpred,
+ int pred_stride) {
+ int i, j;
+ unsigned char *src_ptr, *pred_ptr;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+ uint16x8_t q8u16, q9u16, q10u16, q11u16;
+
+ diff += 256;
+ for (i = 0; i < 2; i++) {
+ if (i == 0) {
+ src_ptr = usrc;
+ pred_ptr = upred;
+ } else if (i == 1) {
+ src_ptr = vsrc;
+ pred_ptr = vpred;
+ }
+
+ for (j = 0; j < 2; j++) {
+ d0u8 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d1u8 = vld1_u8(pred_ptr);
+ pred_ptr += pred_stride;
+ d2u8 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d3u8 = vld1_u8(pred_ptr);
+ pred_ptr += pred_stride;
+ d4u8 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d5u8 = vld1_u8(pred_ptr);
+ pred_ptr += pred_stride;
+ d6u8 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d7u8 = vld1_u8(pred_ptr);
+ pred_ptr += pred_stride;
+
+ q8u16 = vsubl_u8(d0u8, d1u8);
+ q9u16 = vsubl_u8(d2u8, d3u8);
+ q10u16 = vsubl_u8(d4u8, d5u8);
+ q11u16 = vsubl_u8(d6u8, d7u8);
+
+ vst1q_u16((uint16_t *)diff, q8u16);
+ diff += 8;
+ vst1q_u16((uint16_t *)diff, q9u16);
+ diff += 8;
+ vst1q_u16((uint16_t *)diff, q10u16);
+ diff += 8;
+ vst1q_u16((uint16_t *)diff, q11u16);
+ diff += 8;
+ }
+ }
+ return;
+}
diff --git a/media/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c b/media/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c
new file mode 100644
index 000000000..5ad946500
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "vpx_ports/arm.h"
+
+#ifdef VPX_INCOMPATIBLE_GCC
+#include "./vp8_rtcd.h"
+void vp8_short_walsh4x4_neon(
+ int16_t *input,
+ int16_t *output,
+ int pitch) {
+ vp8_short_walsh4x4_c(input, output, pitch);
+}
+#else
+void vp8_short_walsh4x4_neon(
+ int16_t *input,
+ int16_t *output,
+ int pitch) {
+ uint16x4_t d16u16;
+ int16x8_t q0s16, q1s16;
+ int16x4_t dEmptys16, d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+ int32x4_t qEmptys32, q0s32, q1s32, q2s32, q3s32, q8s32;
+ int32x4_t q9s32, q10s32, q11s32, q15s32;
+ uint32x4_t q8u32, q9u32, q10u32, q11u32;
+ int16x4x2_t v2tmp0, v2tmp1;
+ int32x2x2_t v2tmp2, v2tmp3;
+
+ dEmptys16 = vdup_n_s16(0);
+ qEmptys32 = vdupq_n_s32(0);
+ q15s32 = vdupq_n_s32(3);
+
+ d0s16 = vld1_s16(input);
+ input += pitch/2;
+ d1s16 = vld1_s16(input);
+ input += pitch/2;
+ d2s16 = vld1_s16(input);
+ input += pitch/2;
+ d3s16 = vld1_s16(input);
+
+ v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16),
+ vreinterpret_s32_s16(d2s16));
+ v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16),
+ vreinterpret_s32_s16(d3s16));
+ v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]), // d0
+ vreinterpret_s16_s32(v2tmp3.val[0])); // d1
+ v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]), // d2
+ vreinterpret_s16_s32(v2tmp3.val[1])); // d3
+
+ d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[0]);
+ d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[1]);
+ d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[1]);
+ d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[0]);
+
+ d4s16 = vshl_n_s16(d4s16, 2);
+ d5s16 = vshl_n_s16(d5s16, 2);
+ d6s16 = vshl_n_s16(d6s16, 2);
+ d7s16 = vshl_n_s16(d7s16, 2);
+
+ d16u16 = vceq_s16(d4s16, dEmptys16);
+ d16u16 = vmvn_u16(d16u16);
+
+ d0s16 = vadd_s16(d4s16, d5s16);
+ d3s16 = vsub_s16(d4s16, d5s16);
+ d1s16 = vadd_s16(d7s16, d6s16);
+ d2s16 = vsub_s16(d7s16, d6s16);
+
+ d0s16 = vsub_s16(d0s16, vreinterpret_s16_u16(d16u16));
+
+ // Second for-loop
+ v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d1s16),
+ vreinterpret_s32_s16(d3s16));
+ v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d0s16),
+ vreinterpret_s32_s16(d2s16));
+ v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp3.val[1]), // d2
+ vreinterpret_s16_s32(v2tmp2.val[1])); // d3
+ v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp3.val[0]), // d0
+ vreinterpret_s16_s32(v2tmp2.val[0])); // d1
+
+ q8s32 = vaddl_s16(v2tmp1.val[0], v2tmp0.val[0]);
+ q9s32 = vaddl_s16(v2tmp1.val[1], v2tmp0.val[1]);
+ q10s32 = vsubl_s16(v2tmp1.val[1], v2tmp0.val[1]);
+ q11s32 = vsubl_s16(v2tmp1.val[0], v2tmp0.val[0]);
+
+ q0s32 = vaddq_s32(q8s32, q9s32);
+ q1s32 = vaddq_s32(q11s32, q10s32);
+ q2s32 = vsubq_s32(q11s32, q10s32);
+ q3s32 = vsubq_s32(q8s32, q9s32);
+
+ q8u32 = vcltq_s32(q0s32, qEmptys32);
+ q9u32 = vcltq_s32(q1s32, qEmptys32);
+ q10u32 = vcltq_s32(q2s32, qEmptys32);
+ q11u32 = vcltq_s32(q3s32, qEmptys32);
+
+ q8s32 = vreinterpretq_s32_u32(q8u32);
+ q9s32 = vreinterpretq_s32_u32(q9u32);
+ q10s32 = vreinterpretq_s32_u32(q10u32);
+ q11s32 = vreinterpretq_s32_u32(q11u32);
+
+ q0s32 = vsubq_s32(q0s32, q8s32);
+ q1s32 = vsubq_s32(q1s32, q9s32);
+ q2s32 = vsubq_s32(q2s32, q10s32);
+ q3s32 = vsubq_s32(q3s32, q11s32);
+
+ q8s32 = vaddq_s32(q0s32, q15s32);
+ q9s32 = vaddq_s32(q1s32, q15s32);
+ q10s32 = vaddq_s32(q2s32, q15s32);
+ q11s32 = vaddq_s32(q3s32, q15s32);
+
+ d0s16 = vshrn_n_s32(q8s32, 3);
+ d1s16 = vshrn_n_s32(q9s32, 3);
+ d2s16 = vshrn_n_s32(q10s32, 3);
+ d3s16 = vshrn_n_s32(q11s32, 3);
+
+ q0s16 = vcombine_s16(d0s16, d1s16);
+ q1s16 = vcombine_s16(d2s16, d3s16);
+
+ vst1q_s16(output, q0s16);
+ vst1q_s16(output + 8, q1s16);
+ return;
+}
+#endif // VPX_INCOMPATIBLE_GCC