summaryrefslogtreecommitdiffstats
path: root/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
blob: 1a8fed71061cf28a9c2c987c6333a28a14559c23 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_DSP_X86_TXFM_COMMON_AVX2_H
#define AOM_DSP_X86_TXFM_COMMON_AVX2_H

#include <immintrin.h>

#include "aom_dsp/txfm_common.h"
#include "aom_dsp/x86/common_avx2.h"

// Builds a __m256i whose sixteen 16-bit elements alternate a, b, a, b, ...
// starting from element 0 (i.e. every 32-bit element holds `a` in its low
// half and `b` in its high half).
//
// Both arguments are truncated to int16_t first. The pair is packed into a
// single 32-bit value and broadcast, so each macro argument is expanded
// exactly once (safe for arguments with side effects). The final
// uint32_t -> int conversion relies on the usual two's-complement wrap.
#define pair256_set_epi16(a, b)                              \
  _mm256_set1_epi32((int)((uint32_t)(uint16_t)(int16_t)(a) | \
                          ((uint32_t)(uint16_t)(int16_t)(b) << 16)))

// Builds a __m256i whose eight 32-bit elements alternate a, b, a, b, ...
// starting from element 0.
//
// Each argument is converted to int and broadcast once; the in-lane 32-bit
// interleave of the two broadcasts then yields a, b, a, b in every 128-bit
// lane. Each macro argument is therefore expanded exactly once (safe for
// arguments with side effects).
#define pair256_set_epi32(a, b)                      \
  _mm256_unpacklo_epi32(_mm256_set1_epi32((int)(a)), \
                        _mm256_set1_epi32((int)(b)))

// Reverses, in place, the order of the sixteen 16-bit elements of *u
// (element 0 swaps with element 15, 1 with 14, and so on).
static INLINE void mm256_reverse_epi16(__m256i *u) {
  // Byte-shuffle mask, listed from element 0 upwards: output word k of each
  // 128-bit lane takes input word (7 - k) of the same lane. Both lanes use
  // the same pattern.
  const __m256i word_reverse_mask = _mm256_setr_epi16(
      0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100, 0x0F0E,
      0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100);
  // Step 1: reverse the eight words inside each 128-bit lane.
  const __m256i lane_reversed = _mm256_shuffle_epi8(*u, word_reverse_mask);
  // Step 2: swap the two 128-bit lanes to complete the full reversal.
  *u = _mm256_permute2x128_si256(lane_reversed, lane_reversed, 1);
}

// Applies one rounded multiply-accumulate step to two input vectors.
//
// For each of *a0 and *a1, adjacent 16-bit pairs are multiplied by the
// matching 16-bit pairs in *cospi and summed into 32-bit values
// (_mm256_madd_epi16). DCT_CONST_ROUNDING is added and the sum is
// arithmetically shifted right by DCT_CONST_BITS. The two 32-bit results are
// then packed back to 16 bits with signed saturation; within each 128-bit
// lane the four values derived from *a0 come first, followed by the four
// derived from *a1.
static INLINE __m256i butter_fly(const __m256i *a0, const __m256i *a1,
                                 const __m256i *cospi) {
  const __m256i rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
  const __m256i coeffs = *cospi;

  const __m256i from_a0 = _mm256_srai_epi32(
      _mm256_add_epi32(_mm256_madd_epi16(*a0, coeffs), rounding),
      DCT_CONST_BITS);
  const __m256i from_a1 = _mm256_srai_epi32(
      _mm256_add_epi32(_mm256_madd_epi16(*a1, coeffs), rounding),
      DCT_CONST_BITS);

  return _mm256_packs_epi32(from_a0, from_a1);
}

// Scales, in place, the 16 vectors in[0] .. in[15] by the 16-bit factor c.
//
// Every 16-bit element x is first doubled (16-bit left shift by one, so the
// top bit is discarded), then multiplied by c in 32-bit precision, rounded
// with DCT_CONST_ROUNDING, arithmetically shifted right by DCT_CONST_BITS and
// finally packed back to 16 bits with signed saturation.
static INLINE void txfm_scaling16_avx2(const int16_t c, __m256i *in) {
  const __m256i zeros = _mm256_setzero_si256();
  const __m256i factor = _mm256_set1_epi16(c);
  const __m256i rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);

  for (int row = 0; row < 16; ++row) {
    const __m256i doubled = _mm256_slli_epi16(in[row], 1);

    // Interleave with zero so that madd computes 0 * c + x * c, i.e. a plain
    // 16x16 -> 32-bit product for every element.
    __m256i lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(zeros, doubled),
                                   factor);
    __m256i hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(zeros, doubled),
                                   factor);

    lo = _mm256_srai_epi32(_mm256_add_epi32(lo, rounding), DCT_CONST_BITS);
    hi = _mm256_srai_epi32(_mm256_add_epi32(hi, rounding), DCT_CONST_BITS);

    // The in-lane pack undoes the in-lane unpack, restoring element order.
    in[row] = _mm256_packs_epi32(lo, hi);
  }
}

#endif  // AOM_DSP_X86_TXFM_COMMON_AVX2_H