summaryrefslogtreecommitdiffstats
path: root/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm')
-rw-r--r--third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm379
1 files changed, 0 insertions, 379 deletions
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
deleted file mode 100644
index c1fb259a1..000000000
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
+++ /dev/null
@@ -1,379 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-
-pw_11585x2: times 8 dw 23170
-pd_8192: times 4 dd 8192
-
-%macro TRANSFORM_COEFFS 2
-pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2
-pw_%2_m%1: dw %2, -%1, %2, -%1, %2, -%1, %2, -%1
-%endmacro
-
-TRANSFORM_COEFFS 11585, 11585
-TRANSFORM_COEFFS 15137, 6270
-TRANSFORM_COEFFS 16069, 3196
-TRANSFORM_COEFFS 9102, 13623
-
-%macro STORE_OUTPUT 2 ; index, result
- ; const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
- ; __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
- ; __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
- ; _mm_store_si128((__m128i *)(dst_ptr), out0);
- ; _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
- pxor m11, m11
- pcmpgtw m11, m%2
- movdqa m12, m%2
- punpcklwd m%2, m11
- punpckhwd m12, m11
- mova [outputq + 4*%1 + 0], m%2
- mova [outputq + 4*%1 + 16], m12
-%endmacro
-
-SECTION .text
-
-%if ARCH_X86_64
-INIT_XMM ssse3
-cglobal fdct8x8, 3, 5, 13, input, output, stride
-
- mova m8, [GLOBAL(pd_8192)]
- mova m12, [GLOBAL(pw_11585x2)]
-
- lea r3, [2 * strideq]
- lea r4, [4 * strideq]
- mova m0, [inputq]
- mova m1, [inputq + r3]
- lea inputq, [inputq + r4]
- mova m2, [inputq]
- mova m3, [inputq + r3]
- lea inputq, [inputq + r4]
- mova m4, [inputq]
- mova m5, [inputq + r3]
- lea inputq, [inputq + r4]
- mova m6, [inputq]
- mova m7, [inputq + r3]
-
- ; left shift by 2 to increase forward transformation precision
- psllw m0, 2
- psllw m1, 2
- psllw m2, 2
- psllw m3, 2
- psllw m4, 2
- psllw m5, 2
- psllw m6, 2
- psllw m7, 2
-
- ; column transform
- ; stage 1
- paddw m10, m0, m7
- psubw m0, m7
-
- paddw m9, m1, m6
- psubw m1, m6
-
- paddw m7, m2, m5
- psubw m2, m5
-
- paddw m6, m3, m4
- psubw m3, m4
-
- ; stage 2
- paddw m5, m9, m7
- psubw m9, m7
-
- paddw m4, m10, m6
- psubw m10, m6
-
- paddw m7, m1, m2
- psubw m1, m2
-
- ; stage 3
- paddw m6, m4, m5
- psubw m4, m5
-
- pmulhrsw m1, m12
- pmulhrsw m7, m12
-
- ; sin(pi / 8), cos(pi / 8)
- punpcklwd m2, m10, m9
- punpckhwd m10, m9
- pmaddwd m5, m2, [GLOBAL(pw_15137_6270)]
- pmaddwd m2, [GLOBAL(pw_6270_m15137)]
- pmaddwd m9, m10, [GLOBAL(pw_15137_6270)]
- pmaddwd m10, [GLOBAL(pw_6270_m15137)]
- paddd m5, m8
- paddd m2, m8
- paddd m9, m8
- paddd m10, m8
- psrad m5, 14
- psrad m2, 14
- psrad m9, 14
- psrad m10, 14
- packssdw m5, m9
- packssdw m2, m10
-
- pmulhrsw m6, m12
- pmulhrsw m4, m12
-
- paddw m9, m3, m1
- psubw m3, m1
-
- paddw m10, m0, m7
- psubw m0, m7
-
- ; stage 4
- ; sin(pi / 16), cos(pi / 16)
- punpcklwd m1, m10, m9
- punpckhwd m10, m9
- pmaddwd m7, m1, [GLOBAL(pw_16069_3196)]
- pmaddwd m1, [GLOBAL(pw_3196_m16069)]
- pmaddwd m9, m10, [GLOBAL(pw_16069_3196)]
- pmaddwd m10, [GLOBAL(pw_3196_m16069)]
- paddd m7, m8
- paddd m1, m8
- paddd m9, m8
- paddd m10, m8
- psrad m7, 14
- psrad m1, 14
- psrad m9, 14
- psrad m10, 14
- packssdw m7, m9
- packssdw m1, m10
-
- ; sin(3 * pi / 16), cos(3 * pi / 16)
- punpcklwd m11, m0, m3
- punpckhwd m0, m3
- pmaddwd m9, m11, [GLOBAL(pw_9102_13623)]
- pmaddwd m11, [GLOBAL(pw_13623_m9102)]
- pmaddwd m3, m0, [GLOBAL(pw_9102_13623)]
- pmaddwd m0, [GLOBAL(pw_13623_m9102)]
- paddd m9, m8
- paddd m11, m8
- paddd m3, m8
- paddd m0, m8
- psrad m9, 14
- psrad m11, 14
- psrad m3, 14
- psrad m0, 14
- packssdw m9, m3
- packssdw m11, m0
-
- ; transpose
- ; stage 1
- punpcklwd m0, m6, m7
- punpcklwd m3, m5, m11
- punpckhwd m6, m7
- punpckhwd m5, m11
- punpcklwd m7, m4, m9
- punpcklwd m10, m2, m1
- punpckhwd m4, m9
- punpckhwd m2, m1
-
- ; stage 2
- punpckldq m9, m0, m3
- punpckldq m1, m6, m5
- punpckhdq m0, m3
- punpckhdq m6, m5
- punpckldq m3, m7, m10
- punpckldq m5, m4, m2
- punpckhdq m7, m10
- punpckhdq m4, m2
-
- ; stage 3
- punpcklqdq m10, m9, m3
- punpckhqdq m9, m3
- punpcklqdq m2, m0, m7
- punpckhqdq m0, m7
- punpcklqdq m3, m1, m5
- punpckhqdq m1, m5
- punpcklqdq m7, m6, m4
- punpckhqdq m6, m4
-
- ; row transform
- ; stage 1
- paddw m5, m10, m6
- psubw m10, m6
-
- paddw m4, m9, m7
- psubw m9, m7
-
- paddw m6, m2, m1
- psubw m2, m1
-
- paddw m7, m0, m3
- psubw m0, m3
-
- ;stage 2
- paddw m1, m5, m7
- psubw m5, m7
-
- paddw m3, m4, m6
- psubw m4, m6
-
- paddw m7, m9, m2
- psubw m9, m2
-
- ; stage 3
- punpcklwd m6, m1, m3
- punpckhwd m1, m3
- pmaddwd m2, m6, [GLOBAL(pw_11585_11585)]
- pmaddwd m6, [GLOBAL(pw_11585_m11585)]
- pmaddwd m3, m1, [GLOBAL(pw_11585_11585)]
- pmaddwd m1, [GLOBAL(pw_11585_m11585)]
- paddd m2, m8
- paddd m6, m8
- paddd m3, m8
- paddd m1, m8
- psrad m2, 14
- psrad m6, 14
- psrad m3, 14
- psrad m1, 14
- packssdw m2, m3
- packssdw m6, m1
-
- pmulhrsw m7, m12
- pmulhrsw m9, m12
-
- punpcklwd m3, m5, m4
- punpckhwd m5, m4
- pmaddwd m1, m3, [GLOBAL(pw_15137_6270)]
- pmaddwd m3, [GLOBAL(pw_6270_m15137)]
- pmaddwd m4, m5, [GLOBAL(pw_15137_6270)]
- pmaddwd m5, [GLOBAL(pw_6270_m15137)]
- paddd m1, m8
- paddd m3, m8
- paddd m4, m8
- paddd m5, m8
- psrad m1, 14
- psrad m3, 14
- psrad m4, 14
- psrad m5, 14
- packssdw m1, m4
- packssdw m3, m5
-
- paddw m4, m0, m9
- psubw m0, m9
-
- paddw m5, m10, m7
- psubw m10, m7
-
- ; stage 4
- punpcklwd m9, m5, m4
- punpckhwd m5, m4
- pmaddwd m7, m9, [GLOBAL(pw_16069_3196)]
- pmaddwd m9, [GLOBAL(pw_3196_m16069)]
- pmaddwd m4, m5, [GLOBAL(pw_16069_3196)]
- pmaddwd m5, [GLOBAL(pw_3196_m16069)]
- paddd m7, m8
- paddd m9, m8
- paddd m4, m8
- paddd m5, m8
- psrad m7, 14
- psrad m9, 14
- psrad m4, 14
- psrad m5, 14
- packssdw m7, m4
- packssdw m9, m5
-
- punpcklwd m4, m10, m0
- punpckhwd m10, m0
- pmaddwd m5, m4, [GLOBAL(pw_9102_13623)]
- pmaddwd m4, [GLOBAL(pw_13623_m9102)]
- pmaddwd m0, m10, [GLOBAL(pw_9102_13623)]
- pmaddwd m10, [GLOBAL(pw_13623_m9102)]
- paddd m5, m8
- paddd m4, m8
- paddd m0, m8
- paddd m10, m8
- psrad m5, 14
- psrad m4, 14
- psrad m0, 14
- psrad m10, 14
- packssdw m5, m0
- packssdw m4, m10
-
- ; transpose
- ; stage 1
- punpcklwd m0, m2, m7
- punpcklwd m10, m1, m4
- punpckhwd m2, m7
- punpckhwd m1, m4
- punpcklwd m7, m6, m5
- punpcklwd m4, m3, m9
- punpckhwd m6, m5
- punpckhwd m3, m9
-
- ; stage 2
- punpckldq m5, m0, m10
- punpckldq m9, m2, m1
- punpckhdq m0, m10
- punpckhdq m2, m1
- punpckldq m10, m7, m4
- punpckldq m1, m6, m3
- punpckhdq m7, m4
- punpckhdq m6, m3
-
- ; stage 3
- punpcklqdq m4, m5, m10
- punpckhqdq m5, m10
- punpcklqdq m3, m0, m7
- punpckhqdq m0, m7
- punpcklqdq m10, m9, m1
- punpckhqdq m9, m1
- punpcklqdq m7, m2, m6
- punpckhqdq m2, m6
-
- psraw m1, m4, 15
- psraw m6, m5, 15
- psraw m8, m3, 15
- psraw m11, m0, 15
-
- psubw m4, m1
- psubw m5, m6
- psubw m3, m8
- psubw m0, m11
-
- psraw m4, 1
- psraw m5, 1
- psraw m3, 1
- psraw m0, 1
-
- psraw m1, m10, 15
- psraw m6, m9, 15
- psraw m8, m7, 15
- psraw m11, m2, 15
-
- psubw m10, m1
- psubw m9, m6
- psubw m7, m8
- psubw m2, m11
-
- psraw m10, 1
- psraw m9, 1
- psraw m7, 1
- psraw m2, 1
-
- STORE_OUTPUT 0, 4
- STORE_OUTPUT 8, 5
- STORE_OUTPUT 16, 3
- STORE_OUTPUT 24, 0
- STORE_OUTPUT 32, 10
- STORE_OUTPUT 40, 9
- STORE_OUTPUT 48, 7
- STORE_OUTPUT 56, 2
-
- RET
-%endif