summaryrefslogtreecommitdiffstats
path: root/third_party/aom/av1/common/x86/pvq_sse4.c
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/aom/av1/common/x86/pvq_sse4.c')
-rw-r--r--third_party/aom/av1/common/x86/pvq_sse4.c252
1 files changed, 0 insertions, 252 deletions
diff --git a/third_party/aom/av1/common/x86/pvq_sse4.c b/third_party/aom/av1/common/x86/pvq_sse4.c
deleted file mode 100644
index b3ed9efdf..000000000
--- a/third_party/aom/av1/common/x86/pvq_sse4.c
+++ /dev/null
@@ -1,252 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <smmintrin.h>
-#include <emmintrin.h>
-#include <tmmintrin.h>
-#include <float.h>
-
-#include "./av1_rtcd.h"
-#include "av1/common/x86/pvq_sse4.h"
-#include "../odintrin.h"
-#include "av1/common/pvq.h"
-
-#define EPSILON 1e-15f
-
-static __m128 horizontal_sum_ps(__m128 x) {
- x = _mm_add_ps(x, _mm_shuffle_ps(x, x, _MM_SHUFFLE(1, 0, 3, 2)));
- x = _mm_add_ps(x, _mm_shuffle_ps(x, x, _MM_SHUFFLE(2, 3, 0, 1)));
- return x;
-}
-
-static __m128i horizontal_sum_epi32(__m128i x) {
- x = _mm_add_epi32(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2)));
- x = _mm_add_epi32(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1)));
- return x;
-}
-
-static INLINE float rsqrtf(float x) {
- float y;
- _mm_store_ss(&y, _mm_rsqrt_ss(_mm_load_ss(&x)));
- return y;
-}
-
-/** Find the codepoint on the given PSphere closest to the desired
- * vector. This is a float-precision PVQ search just to make sure
- * our tests aren't limited by numerical accuracy. It's close to the
- * pvq_search_rdo_double_c implementation, but is not bit accurate and
- * it performs slightly worse on PSNR. One reason is that this code runs
- * more RDO iterations than the C code. It also uses single precision
- * floating point math, whereas the C version uses double precision.
- *
- * @param [in] xcoeff input vector to quantize (x in the math doc)
- * @param [in] n number of dimensions
- * @param [in] k number of pulses
- * @param [out] ypulse optimal codevector found (y in the math doc)
- * @param [in] g2 multiplier for the distortion (typically squared
- * gain units)
- * @param [in] pvq_norm_lambda enc->pvq_norm_lambda for quantized RDO
- * @param [in] prev_k number of pulses already in ypulse that we should
- * reuse for the search (or 0 for a new search)
- * @return cosine distance between x and y (between 0 and 1)
- */
-double pvq_search_rdo_double_sse4_1(const od_val16 *xcoeff, int n, int k,
- int *ypulse, double g2,
- double pvq_norm_lambda, int prev_k) {
- int i, j;
- int reuse_pulses = prev_k > 0 && prev_k <= k;
- /* TODO - This blows our 8kB stack space budget and should be fixed when
- converting PVQ to fixed point. */
- float xx = 0, xy = 0, yy = 0;
- float x[MAXN + 3];
- float y[MAXN + 3];
- float sign_y[MAXN + 3];
- for (i = 0; i < n; i++) {
- float tmp = (float)xcoeff[i];
- xx += tmp * tmp;
- x[i] = xcoeff[i];
- }
-
- x[n] = x[n + 1] = x[n + 2] = 0;
- ypulse[n] = ypulse[n + 1] = ypulse[n + 2] = 0;
-
- __m128 sums = _mm_setzero_ps();
- for (i = 0; i < n; i += 4) {
- __m128 x4 = _mm_loadu_ps(&x[i]);
- __m128 s4 = _mm_cmplt_ps(x4, _mm_setzero_ps());
- /* Save the sign, we'll put it back later. */
- _mm_storeu_ps(&sign_y[i], s4);
- /* Get rid of the sign. */
- x4 = _mm_andnot_ps(_mm_set_ps1(-0.f), x4);
- sums = _mm_add_ps(sums, x4);
- if (!reuse_pulses) {
- /* Clear y and ypulse in case we don't do the projection. */
- _mm_storeu_ps(&y[i], _mm_setzero_ps());
- _mm_storeu_si128((__m128i *)&ypulse[i], _mm_setzero_si128());
- }
- _mm_storeu_ps(&x[i], x4);
- }
- sums = horizontal_sum_ps(sums);
- int pulses_left = k;
- {
- __m128i pulses_sum;
- __m128 yy4, xy4;
- xy4 = yy4 = _mm_setzero_ps();
- pulses_sum = _mm_setzero_si128();
- if (reuse_pulses) {
- /* We reuse pulses from a previous search so we don't have to search them
- again. */
- for (j = 0; j < n; j += 4) {
- __m128 x4, y4;
- __m128i iy4;
- iy4 = _mm_abs_epi32(_mm_loadu_si128((__m128i *)&ypulse[j]));
- pulses_sum = _mm_add_epi32(pulses_sum, iy4);
- _mm_storeu_si128((__m128i *)&ypulse[j], iy4);
- y4 = _mm_cvtepi32_ps(iy4);
- x4 = _mm_loadu_ps(&x[j]);
- xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4));
- yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4));
- /* Double the y[] vector so we don't have to do it in the search loop.
- */
- _mm_storeu_ps(&y[j], _mm_add_ps(y4, y4));
- }
- pulses_left -= _mm_cvtsi128_si32(horizontal_sum_epi32(pulses_sum));
- xy4 = horizontal_sum_ps(xy4);
- xy = _mm_cvtss_f32(xy4);
- yy4 = horizontal_sum_ps(yy4);
- yy = _mm_cvtss_f32(yy4);
- } else if (k > (n >> 1)) {
- /* Do a pre-search by projecting on the pyramid. */
- __m128 rcp4;
- float sum = _mm_cvtss_f32(sums);
- /* If x is too small, just replace it with a pulse at 0. This prevents
- infinities and NaNs from causing too many pulses to be allocated. Here,
- 64 is an
- approximation of infinity. */
- if (sum <= EPSILON) {
- x[0] = 1.f;
- for (i = 1; i < n; i++) {
- x[i] = 0;
- }
- sums = _mm_set_ps1(1.f);
- }
- /* Using k + e with e < 1 guarantees we cannot get more than k pulses. */
- rcp4 = _mm_mul_ps(_mm_set_ps1((float)k + .8f), _mm_rcp_ps(sums));
- xy4 = yy4 = _mm_setzero_ps();
- pulses_sum = _mm_setzero_si128();
- for (j = 0; j < n; j += 4) {
- __m128 rx4, x4, y4;
- __m128i iy4;
- x4 = _mm_loadu_ps(&x[j]);
- rx4 = _mm_mul_ps(x4, rcp4);
- iy4 = _mm_cvttps_epi32(rx4);
- pulses_sum = _mm_add_epi32(pulses_sum, iy4);
- _mm_storeu_si128((__m128i *)&ypulse[j], iy4);
- y4 = _mm_cvtepi32_ps(iy4);
- xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4));
- yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4));
- /* Double the y[] vector so we don't have to do it in the search loop.
- */
- _mm_storeu_ps(&y[j], _mm_add_ps(y4, y4));
- }
- pulses_left -= _mm_cvtsi128_si32(horizontal_sum_epi32(pulses_sum));
- xy = _mm_cvtss_f32(horizontal_sum_ps(xy4));
- yy = _mm_cvtss_f32(horizontal_sum_ps(yy4));
- }
- x[n] = x[n + 1] = x[n + 2] = -100;
- y[n] = y[n + 1] = y[n + 2] = 100;
- }
-
- /* This should never happen. */
- OD_ASSERT(pulses_left <= n + 3);
-
- float lambda_delta_rate[MAXN + 3];
- if (pulses_left) {
- /* Hoist lambda to avoid the multiply in the loop. */
- float lambda =
- 0.5f * sqrtf(xx) * (float)pvq_norm_lambda / (FLT_MIN + (float)g2);
- float delta_rate = 3.f / n;
- __m128 count = _mm_set_ps(3, 2, 1, 0);
- for (i = 0; i < n; i += 4) {
- _mm_storeu_ps(&lambda_delta_rate[i],
- _mm_mul_ps(count, _mm_set_ps1(lambda * delta_rate)));
- count = _mm_add_ps(count, _mm_set_ps(4, 4, 4, 4));
- }
- }
- lambda_delta_rate[n] = lambda_delta_rate[n + 1] = lambda_delta_rate[n + 2] =
- 1e30f;
-
- for (i = 0; i < pulses_left; i++) {
- int best_id = 0;
- __m128 xy4, yy4;
- __m128 max, max2;
- __m128i count;
- __m128i pos;
-
- /* The squared magnitude term gets added anyway, so we might as well
- add it outside the loop. */
- yy = yy + 1;
- xy4 = _mm_load1_ps(&xy);
- yy4 = _mm_load1_ps(&yy);
- max = _mm_setzero_ps();
- pos = _mm_setzero_si128();
- count = _mm_set_epi32(3, 2, 1, 0);
- for (j = 0; j < n; j += 4) {
- __m128 x4, y4, r4;
- x4 = _mm_loadu_ps(&x[j]);
- y4 = _mm_loadu_ps(&y[j]);
- x4 = _mm_add_ps(x4, xy4);
- y4 = _mm_add_ps(y4, yy4);
- y4 = _mm_rsqrt_ps(y4);
- r4 = _mm_mul_ps(x4, y4);
- /* Subtract lambda. */
- r4 = _mm_sub_ps(r4, _mm_loadu_ps(&lambda_delta_rate[j]));
- /* Update the index of the max. */
- pos = _mm_max_epi16(
- pos, _mm_and_si128(count, _mm_castps_si128(_mm_cmpgt_ps(r4, max))));
- /* Update the max. */
- max = _mm_max_ps(max, r4);
- /* Update the indices (+4) */
- count = _mm_add_epi32(count, _mm_set_epi32(4, 4, 4, 4));
- }
- /* Horizontal max. */
- max2 = _mm_max_ps(max, _mm_shuffle_ps(max, max, _MM_SHUFFLE(1, 0, 3, 2)));
- max2 =
- _mm_max_ps(max2, _mm_shuffle_ps(max2, max2, _MM_SHUFFLE(2, 3, 0, 1)));
- /* Now that max2 contains the max at all positions, look at which value(s)
- of the
- partial max is equal to the global max. */
- pos = _mm_and_si128(pos, _mm_castps_si128(_mm_cmpeq_ps(max, max2)));
- pos = _mm_max_epi16(pos, _mm_unpackhi_epi64(pos, pos));
- pos = _mm_max_epi16(pos, _mm_shufflelo_epi16(pos, _MM_SHUFFLE(1, 0, 3, 2)));
- best_id = _mm_cvtsi128_si32(pos);
- OD_ASSERT(best_id < n);
- /* Updating the sums of the new pulse(s) */
- xy = xy + x[best_id];
- /* We're multiplying y[j] by two so we don't have to do it here. */
- yy = yy + y[best_id];
- /* Only now that we've made the final choice, update y/ypulse. */
- /* Multiplying y[j] by 2 so we don't have to do it everywhere else. */
- y[best_id] += 2;
- ypulse[best_id]++;
- }
-
- /* Put the original sign back. */
- for (i = 0; i < n; i += 4) {
- __m128i y4;
- __m128i s4;
- y4 = _mm_loadu_si128((__m128i *)&ypulse[i]);
- s4 = _mm_castps_si128(_mm_loadu_ps(&sign_y[i]));
- y4 = _mm_xor_si128(_mm_add_epi32(y4, s4), s4);
- _mm_storeu_si128((__m128i *)&ypulse[i], y4);
- }
- return xy * rsqrtf(xx * yy + FLT_MIN);
-}