/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <smmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include <float.h>

#include "./av1_rtcd.h"
#include "av1/common/x86/pvq_sse4.h"
#include "../odintrin.h"
#include "av1/common/pvq.h"

#define EPSILON 1e-15f

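/* Sum the four float lanes of x; the total ends up replicated in every lane of
   the result. */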
static __m128 horizontal_sum_ps(__m128 x) {
  x = _mm_add_ps(x, _mm_shuffle_ps(x, x, _MM_SHUFFLE(1, 0, 3, 2)));
  x = _mm_add_ps(x, _mm_shuffle_ps(x, x, _MM_SHUFFLE(2, 3, 0, 1)));
  return x;
}

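/* Sum the four 32-bit integer lanes of x; the total ends up replicated in
   every lane of the result. */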
static __m128i horizontal_sum_epi32(__m128i x) {
  x = _mm_add_epi32(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2)));
  x = _mm_add_epi32(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1)));
  return x;
}

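/* Scalar approximate reciprocal square root via the SSE rsqrt instruction
   (relative error on the order of 2^-12). */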
static INLINE float rsqrtf(float x) {
  float y;
  _mm_store_ss(&y, _mm_rsqrt_ss(_mm_load_ss(&x)));
  return y;
}

/** Find the codepoint on the given PSphere closest to the desired
 * vector. This is a single-precision PVQ search. It is close to the
 * pvq_search_rdo_double_c implementation, but is not bit exact and it
 * performs slightly worse on PSNR. One reason is that this code runs
 * more RDO iterations than the C code; another is that it uses single
 * precision floating point math, whereas the C version uses double
 * precision.
 *
 * @param [in]      xcoeff  input vector to quantize (x in the math doc)
 * @param [in]      n       number of dimensions
 * @param [in]      k       number of pulses
 * @param [out]     ypulse  optimal codevector found (y in the math doc)
 * @param [in]      g2      multiplier for the distortion (typically squared
 *                          gain units)
 * @param [in] pvq_norm_lambda enc->pvq_norm_lambda for quantized RDO
 * @param [in]      prev_k  number of pulses already in ypulse that we should
 *                          reuse for the search (or 0 for a new search)
 * @return                  cosine distance between x and y (between 0 and 1)
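 * A hedged usage sketch follows the function definition below.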
 */
double pvq_search_rdo_double_sse4_1(const od_val16 *xcoeff, int n, int k,
                                    int *ypulse, double g2,
                                    double pvq_norm_lambda, int prev_k) {
  int i, j;
  int reuse_pulses = prev_k > 0 && prev_k <= k;
  /* TODO - This blows our 8kB stack space budget and should be fixed when
     converting PVQ to fixed point. */
  float xx = 0, xy = 0, yy = 0;
  float x[MAXN + 3];
  float y[MAXN + 3];
  float sign_y[MAXN + 3];
  for (i = 0; i < n; i++) {
    float tmp = (float)xcoeff[i];
    xx += tmp * tmp;
    x[i] = xcoeff[i];
  }

  x[n] = x[n + 1] = x[n + 2] = 0;
  ypulse[n] = ypulse[n + 1] = ypulse[n + 2] = 0;

  __m128 sums = _mm_setzero_ps();
  for (i = 0; i < n; i += 4) {
    __m128 x4 = _mm_loadu_ps(&x[i]);
    __m128 s4 = _mm_cmplt_ps(x4, _mm_setzero_ps());
    /* Save the sign, we'll put it back later. */
    _mm_storeu_ps(&sign_y[i], s4);
    /* Get rid of the sign. */
    x4 = _mm_andnot_ps(_mm_set_ps1(-0.f), x4);
    sums = _mm_add_ps(sums, x4);
    if (!reuse_pulses) {
      /* Clear y and ypulse in case we don't do the projection. */
      _mm_storeu_ps(&y[i], _mm_setzero_ps());
      _mm_storeu_si128((__m128i *)&ypulse[i], _mm_setzero_si128());
    }
    _mm_storeu_ps(&x[i], x4);
  }
  sums = horizontal_sum_ps(sums);
  int pulses_left = k;
  {
    __m128i pulses_sum;
    __m128 yy4, xy4;
    xy4 = yy4 = _mm_setzero_ps();
    pulses_sum = _mm_setzero_si128();
    if (reuse_pulses) {
      /* We reuse pulses from a previous search so we don't have to search
         them again. */
      for (j = 0; j < n; j += 4) {
        __m128 x4, y4;
        __m128i iy4;
        iy4 = _mm_abs_epi32(_mm_loadu_si128((__m128i *)&ypulse[j]));
        pulses_sum = _mm_add_epi32(pulses_sum, iy4);
        _mm_storeu_si128((__m128i *)&ypulse[j], iy4);
        y4 = _mm_cvtepi32_ps(iy4);
        x4 = _mm_loadu_ps(&x[j]);
        xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4));
        yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4));
        /* Double the y[] vector so we don't have to do it in the search loop.
         */
        _mm_storeu_ps(&y[j], _mm_add_ps(y4, y4));
      }
      pulses_left -= _mm_cvtsi128_si32(horizontal_sum_epi32(pulses_sum));
      xy4 = horizontal_sum_ps(xy4);
      xy = _mm_cvtss_f32(xy4);
      yy4 = horizontal_sum_ps(yy4);
      yy = _mm_cvtss_f32(yy4);
    } else if (k > (n >> 1)) {
      /* Do a pre-search by projecting on the pyramid. */
      __m128 rcp4;
      float sum = _mm_cvtss_f32(sums);
      /* If x is too small, just replace it with a pulse at 0. This prevents
         infinities and NaNs from causing too many pulses to be allocated.
         Here, 64 is an approximation of infinity. */
      if (sum <= EPSILON) {
        x[0] = 1.f;
        for (i = 1; i < n; i++) {
          x[i] = 0;
        }
        sums = _mm_set_ps1(1.f);
      }
      /* Using k + e with e < 1 guarantees we cannot get more than k pulses. */
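      /* Hedged note: with exact arithmetic the per-coefficient truncations sum
         to at most floor(k + .8) = k, since truncation can only lose pulses;
         using .8 rather than something closer to 1 presumably also leaves
         headroom for the _mm_rcp_ps approximation error. */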
      rcp4 = _mm_mul_ps(_mm_set_ps1((float)k + .8f), _mm_rcp_ps(sums));
      xy4 = yy4 = _mm_setzero_ps();
      pulses_sum = _mm_setzero_si128();
      for (j = 0; j < n; j += 4) {
        __m128 rx4, x4, y4;
        __m128i iy4;
        x4 = _mm_loadu_ps(&x[j]);
        rx4 = _mm_mul_ps(x4, rcp4);
        iy4 = _mm_cvttps_epi32(rx4);
        pulses_sum = _mm_add_epi32(pulses_sum, iy4);
        _mm_storeu_si128((__m128i *)&ypulse[j], iy4);
        y4 = _mm_cvtepi32_ps(iy4);
        xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4));
        yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4));
        /* Double the y[] vector so we don't have to do it in the search loop.
         */
        _mm_storeu_ps(&y[j], _mm_add_ps(y4, y4));
      }
      pulses_left -= _mm_cvtsi128_si32(horizontal_sum_epi32(pulses_sum));
      xy = _mm_cvtss_f32(horizontal_sum_ps(xy4));
      yy = _mm_cvtss_f32(horizontal_sum_ps(yy4));
    }
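    /* Fill the padding lanes with values that can never win the search below:
       a large negative numerator term and a large denominator term. */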
    x[n] = x[n + 1] = x[n + 2] = -100;
    y[n] = y[n + 1] = y[n + 2] = 100;
  }

  /* This should never happen. */
  OD_ASSERT(pulses_left <= n + 3);

  float lambda_delta_rate[MAXN + 3];
  if (pulses_left) {
    /* Hoist lambda to avoid the multiply in the loop. */
    float lambda =
        0.5f * sqrtf(xx) * (float)pvq_norm_lambda / (FLT_MIN + (float)g2);
    float delta_rate = 3.f / n;
    __m128 count = _mm_set_ps(3, 2, 1, 0);
    for (i = 0; i < n; i += 4) {
      _mm_storeu_ps(&lambda_delta_rate[i],
                    _mm_mul_ps(count, _mm_set_ps1(lambda * delta_rate)));
      count = _mm_add_ps(count, _mm_set_ps(4, 4, 4, 4));
    }
  }
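  /* Padding: give the extra lanes a prohibitive rate cost so they are never
     selected. */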
  lambda_delta_rate[n] = lambda_delta_rate[n + 1] = lambda_delta_rate[n + 2] =
      1e30f;

  for (i = 0; i < pulses_left; i++) {
    int best_id = 0;
    __m128 xy4, yy4;
    __m128 max, max2;
    __m128i count;
    __m128i pos;

    /* The squared magnitude term gets added anyway, so we might as well
       add it outside the loop. */
    yy = yy + 1;
    xy4 = _mm_load1_ps(&xy);
    yy4 = _mm_load1_ps(&yy);
    max = _mm_setzero_ps();
    pos = _mm_setzero_si128();
    count = _mm_set_epi32(3, 2, 1, 0);
    for (j = 0; j < n; j += 4) {
      __m128 x4, y4, r4;
      x4 = _mm_loadu_ps(&x[j]);
      y4 = _mm_loadu_ps(&y[j]);
      x4 = _mm_add_ps(x4, xy4);
      y4 = _mm_add_ps(y4, yy4);
      y4 = _mm_rsqrt_ps(y4);
      r4 = _mm_mul_ps(x4, y4);
      /* Subtract lambda. */
      r4 = _mm_sub_ps(r4, _mm_loadu_ps(&lambda_delta_rate[j]));
      /* Update the index of the max. */
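      /* Note: a per-lane 16-bit max is enough here because the indices are
         small non-negative values (well below 2^15) whose upper 16 bits are
         zero, so it behaves like a 32-bit max. */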
      pos = _mm_max_epi16(
          pos, _mm_and_si128(count, _mm_castps_si128(_mm_cmpgt_ps(r4, max))));
      /* Update the max. */
      max = _mm_max_ps(max, r4);
      /* Update the indices (+4) */
      count = _mm_add_epi32(count, _mm_set_epi32(4, 4, 4, 4));
    }
    /* Horizontal max. */
    max2 = _mm_max_ps(max, _mm_shuffle_ps(max, max, _MM_SHUFFLE(1, 0, 3, 2)));
    max2 =
        _mm_max_ps(max2, _mm_shuffle_ps(max2, max2, _MM_SHUFFLE(2, 3, 0, 1)));
    /* Now that max2 contains the max in every lane, find which lane(s) of the
       partial max equal the global max. */
    pos = _mm_and_si128(pos, _mm_castps_si128(_mm_cmpeq_ps(max, max2)));
    pos = _mm_max_epi16(pos, _mm_unpackhi_epi64(pos, pos));
    pos = _mm_max_epi16(pos, _mm_shufflelo_epi16(pos, _MM_SHUFFLE(1, 0, 3, 2)));
    best_id = _mm_cvtsi128_si32(pos);
    OD_ASSERT(best_id < n);
    /* Updating the sums of the new pulse(s) */
    xy = xy + x[best_id];
    /* We're multiplying y[j] by two so we don't have to do it here. */
    yy = yy + y[best_id];
    /* Only now that we've made the final choice, update y/ypulse. */
    /* Multiplying y[j] by 2 so we don't have to do it everywhere else. */
    y[best_id] += 2;
    ypulse[best_id]++;
  }

  /* Put the original sign back. */
  for (i = 0; i < n; i += 4) {
    __m128i y4;
    __m128i s4;
    y4 = _mm_loadu_si128((__m128i *)&ypulse[i]);
    s4 = _mm_castps_si128(_mm_loadu_ps(&sign_y[i]));
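    /* sign_y[] is all-ones where the input was negative and zero elsewhere, so
       (y + s) ^ s negates y in the negative lanes (~(y - 1) == -y) and leaves
       the other lanes unchanged. */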
    y4 = _mm_xor_si128(_mm_add_epi32(y4, s4), s4);
    _mm_storeu_si128((__m128i *)&ypulse[i], y4);
  }
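  /* Return xy / sqrt(xx * yy), the cosine value documented above; adding
     FLT_MIN keeps the denominator from being zero for an all-zero input. */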
  return xy * rsqrtf(xx * yy + FLT_MIN);
}
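
/* A minimal usage sketch (illustrative only, not called anywhere in the
 * library): the caller supplies a coefficient vector, its length n, and a
 * pulse budget k, and must size ypulse with at least n + 3 entries because the
 * search writes past index n - 1 for SIMD padding. The values below (n = 16,
 * k = 10, g2, pvq_norm_lambda) are hypothetical, and the block is kept out of
 * the build with #if 0. */
#if 0
static void example_pvq_search_call(const od_val16 *xcoeff) {
  int ypulse[16 + 3]; /* n + 3 entries: the tail is used as SIMD padding. */
  double g2 = 1.0;              /* squared gain (assumed) */
  double pvq_norm_lambda = 0.1; /* encoder RDO lambda (assumed) */
  /* prev_k = 0 starts a fresh search instead of reusing previous pulses. */
  double cos_dist = pvq_search_rdo_double_sse4_1(xcoeff, 16, 10, ypulse, g2,
                                                 pvq_norm_lambda, 0);
  (void)cos_dist; /* cosine distance between xcoeff and ypulse, in [0, 1]. */
}
#endif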