summaryrefslogtreecommitdiffstats
path: root/third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h
blob: 26c5cfe59b734a3efd565534c64edfbd1d294fc5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_DSP_X86_INV_TXFM_COMMON_AVX2_H
#define AOM_DSP_X86_INV_TXFM_COMMON_AVX2_H

#include <immintrin.h>

#include "aom_dsp/txfm_common.h"
#include "aom_dsp/x86/txfm_common_avx2.h"

static INLINE void load_coeff(const tran_low_t *coeff, __m256i *in) {
  if (sizeof(tran_low_t) == 4) {
    *in = _mm256_setr_epi16(
        (int16_t)coeff[0], (int16_t)coeff[1], (int16_t)coeff[2],
        (int16_t)coeff[3], (int16_t)coeff[4], (int16_t)coeff[5],
        (int16_t)coeff[6], (int16_t)coeff[7], (int16_t)coeff[8],
        (int16_t)coeff[9], (int16_t)coeff[10], (int16_t)coeff[11],
        (int16_t)coeff[12], (int16_t)coeff[13], (int16_t)coeff[14],
        (int16_t)coeff[15]);
  } else {
    *in = _mm256_loadu_si256((const __m256i *)coeff);
  }
}

static INLINE void load_buffer_16x16(const tran_low_t *coeff, __m256i *in) {
  int i = 0;
  while (i < 16) {
    load_coeff(coeff + (i << 4), &in[i]);
    i += 1;
  }
}

static INLINE void recon_and_store(const __m256i *res, uint8_t *output) {
  const __m128i zero = _mm_setzero_si128();
  __m128i x = _mm_loadu_si128((__m128i const *)output);
  __m128i p0 = _mm_unpacklo_epi8(x, zero);
  __m128i p1 = _mm_unpackhi_epi8(x, zero);

  p0 = _mm_add_epi16(p0, _mm256_castsi256_si128(*res));
  p1 = _mm_add_epi16(p1, _mm256_extractf128_si256(*res, 1));
  x = _mm_packus_epi16(p0, p1);
  _mm_storeu_si128((__m128i *)output, x);
}

#define IDCT_ROUNDING_POS (6)
static INLINE void store_buffer_16xN(__m256i *in, const int stride,
                                     uint8_t *output, int num) {
  const __m256i rounding = _mm256_set1_epi16(1 << (IDCT_ROUNDING_POS - 1));
  int i = 0;

  while (i < num) {
    in[i] = _mm256_adds_epi16(in[i], rounding);
    in[i] = _mm256_srai_epi16(in[i], IDCT_ROUNDING_POS);
    recon_and_store(&in[i], output + i * stride);
    i += 1;
  }
}

static INLINE void unpack_butter_fly(const __m256i *a0, const __m256i *a1,
                                     const __m256i *c0, const __m256i *c1,
                                     __m256i *b0, __m256i *b1) {
  __m256i x0, x1;
  x0 = _mm256_unpacklo_epi16(*a0, *a1);
  x1 = _mm256_unpackhi_epi16(*a0, *a1);
  *b0 = butter_fly(&x0, &x1, c0);
  *b1 = butter_fly(&x0, &x1, c1);
}

void av1_idct16_avx2(__m256i *in);

#endif  // AOM_DSP_X86_INV_TXFM_COMMON_AVX2_H