diff options
Diffstat (limited to 'third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm')
-rw-r--r-- | third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm | 379 |
1 files changed, 0 insertions, 379 deletions
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm deleted file mode 100644 index c1fb259a1..000000000 --- a/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm +++ /dev/null @@ -1,379 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA - -pw_11585x2: times 8 dw 23170 -pd_8192: times 4 dd 8192 - -%macro TRANSFORM_COEFFS 2 -pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2 -pw_%2_m%1: dw %2, -%1, %2, -%1, %2, -%1, %2, -%1 -%endmacro - -TRANSFORM_COEFFS 11585, 11585 -TRANSFORM_COEFFS 15137, 6270 -TRANSFORM_COEFFS 16069, 3196 -TRANSFORM_COEFFS 9102, 13623 - -%macro STORE_OUTPUT 2 ; index, result - ; const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); - ; __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); - ; __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); - ; _mm_store_si128((__m128i *)(dst_ptr), out0); - ; _mm_store_si128((__m128i *)(dst_ptr + 4), out1); - pxor m11, m11 - pcmpgtw m11, m%2 - movdqa m12, m%2 - punpcklwd m%2, m11 - punpckhwd m12, m11 - mova [outputq + 4*%1 + 0], m%2 - mova [outputq + 4*%1 + 16], m12 -%endmacro - -SECTION .text - -%if ARCH_X86_64 -INIT_XMM ssse3 -cglobal fdct8x8, 3, 5, 13, input, output, stride - - mova m8, [GLOBAL(pd_8192)] - mova m12, [GLOBAL(pw_11585x2)] - - lea r3, [2 * strideq] - lea r4, [4 * strideq] - mova m0, [inputq] - mova m1, [inputq + r3] - lea inputq, [inputq + r4] - mova m2, [inputq] - mova m3, [inputq + r3] - lea inputq, [inputq + r4] - mova m4, [inputq] - mova m5, [inputq + r3] - lea inputq, [inputq + r4] - mova m6, [inputq] - mova m7, [inputq + r3] - - ; left shift by 2 to increase forward transformation precision - psllw m0, 2 - psllw m1, 2 - psllw m2, 2 - psllw m3, 2 - psllw m4, 2 - psllw m5, 2 - psllw m6, 2 - psllw m7, 2 - - ; column transform - ; stage 1 - paddw m10, m0, m7 - psubw m0, m7 - - paddw m9, m1, m6 - psubw m1, m6 - - paddw m7, m2, m5 - psubw m2, m5 - - paddw m6, m3, m4 - psubw m3, m4 - - ; stage 2 - paddw m5, m9, m7 - psubw m9, m7 - - paddw m4, m10, m6 - psubw m10, m6 - - paddw m7, m1, m2 - psubw m1, m2 - - ; stage 3 - paddw m6, m4, m5 - psubw m4, m5 - - pmulhrsw m1, m12 - pmulhrsw m7, m12 - - ; sin(pi / 8), cos(pi / 8) - punpcklwd m2, m10, m9 - punpckhwd m10, m9 - pmaddwd m5, m2, [GLOBAL(pw_15137_6270)] - pmaddwd m2, [GLOBAL(pw_6270_m15137)] - pmaddwd m9, m10, [GLOBAL(pw_15137_6270)] - pmaddwd m10, [GLOBAL(pw_6270_m15137)] - paddd m5, m8 - paddd m2, m8 - paddd m9, m8 - paddd m10, m8 - psrad m5, 14 - psrad m2, 14 - psrad m9, 14 - psrad m10, 14 - packssdw m5, m9 - packssdw m2, m10 - - pmulhrsw m6, m12 - pmulhrsw m4, m12 - - paddw m9, m3, m1 - psubw m3, m1 - - paddw m10, m0, m7 - psubw m0, m7 - - ; stage 4 - ; sin(pi / 16), cos(pi / 16) - punpcklwd m1, m10, m9 - punpckhwd m10, m9 - pmaddwd m7, m1, [GLOBAL(pw_16069_3196)] - pmaddwd m1, [GLOBAL(pw_3196_m16069)] - pmaddwd m9, m10, [GLOBAL(pw_16069_3196)] - pmaddwd m10, [GLOBAL(pw_3196_m16069)] - paddd m7, m8 - paddd m1, m8 - paddd m9, m8 - paddd m10, m8 - psrad m7, 14 - psrad m1, 14 - psrad m9, 14 - psrad m10, 14 - packssdw m7, m9 - packssdw m1, m10 - - ; sin(3 * pi / 16), cos(3 * pi / 16) - punpcklwd m11, m0, m3 - punpckhwd m0, m3 - pmaddwd m9, m11, [GLOBAL(pw_9102_13623)] - pmaddwd m11, [GLOBAL(pw_13623_m9102)] - pmaddwd m3, m0, [GLOBAL(pw_9102_13623)] - pmaddwd m0, [GLOBAL(pw_13623_m9102)] - paddd m9, m8 - paddd m11, m8 - paddd m3, m8 - paddd m0, m8 - psrad m9, 14 - psrad m11, 14 - psrad m3, 14 - psrad m0, 14 - packssdw m9, m3 - packssdw m11, m0 - - ; transpose - ; stage 1 - punpcklwd m0, m6, m7 - punpcklwd m3, m5, m11 - punpckhwd m6, m7 - punpckhwd m5, m11 - punpcklwd m7, m4, m9 - punpcklwd m10, m2, m1 - punpckhwd m4, m9 - punpckhwd m2, m1 - - ; stage 2 - punpckldq m9, m0, m3 - punpckldq m1, m6, m5 - punpckhdq m0, m3 - punpckhdq m6, m5 - punpckldq m3, m7, m10 - punpckldq m5, m4, m2 - punpckhdq m7, m10 - punpckhdq m4, m2 - - ; stage 3 - punpcklqdq m10, m9, m3 - punpckhqdq m9, m3 - punpcklqdq m2, m0, m7 - punpckhqdq m0, m7 - punpcklqdq m3, m1, m5 - punpckhqdq m1, m5 - punpcklqdq m7, m6, m4 - punpckhqdq m6, m4 - - ; row transform - ; stage 1 - paddw m5, m10, m6 - psubw m10, m6 - - paddw m4, m9, m7 - psubw m9, m7 - - paddw m6, m2, m1 - psubw m2, m1 - - paddw m7, m0, m3 - psubw m0, m3 - - ;stage 2 - paddw m1, m5, m7 - psubw m5, m7 - - paddw m3, m4, m6 - psubw m4, m6 - - paddw m7, m9, m2 - psubw m9, m2 - - ; stage 3 - punpcklwd m6, m1, m3 - punpckhwd m1, m3 - pmaddwd m2, m6, [GLOBAL(pw_11585_11585)] - pmaddwd m6, [GLOBAL(pw_11585_m11585)] - pmaddwd m3, m1, [GLOBAL(pw_11585_11585)] - pmaddwd m1, [GLOBAL(pw_11585_m11585)] - paddd m2, m8 - paddd m6, m8 - paddd m3, m8 - paddd m1, m8 - psrad m2, 14 - psrad m6, 14 - psrad m3, 14 - psrad m1, 14 - packssdw m2, m3 - packssdw m6, m1 - - pmulhrsw m7, m12 - pmulhrsw m9, m12 - - punpcklwd m3, m5, m4 - punpckhwd m5, m4 - pmaddwd m1, m3, [GLOBAL(pw_15137_6270)] - pmaddwd m3, [GLOBAL(pw_6270_m15137)] - pmaddwd m4, m5, [GLOBAL(pw_15137_6270)] - pmaddwd m5, [GLOBAL(pw_6270_m15137)] - paddd m1, m8 - paddd m3, m8 - paddd m4, m8 - paddd m5, m8 - psrad m1, 14 - psrad m3, 14 - psrad m4, 14 - psrad m5, 14 - packssdw m1, m4 - packssdw m3, m5 - - paddw m4, m0, m9 - psubw m0, m9 - - paddw m5, m10, m7 - psubw m10, m7 - - ; stage 4 - punpcklwd m9, m5, m4 - punpckhwd m5, m4 - pmaddwd m7, m9, [GLOBAL(pw_16069_3196)] - pmaddwd m9, [GLOBAL(pw_3196_m16069)] - pmaddwd m4, m5, [GLOBAL(pw_16069_3196)] - pmaddwd m5, [GLOBAL(pw_3196_m16069)] - paddd m7, m8 - paddd m9, m8 - paddd m4, m8 - paddd m5, m8 - psrad m7, 14 - psrad m9, 14 - psrad m4, 14 - psrad m5, 14 - packssdw m7, m4 - packssdw m9, m5 - - punpcklwd m4, m10, m0 - punpckhwd m10, m0 - pmaddwd m5, m4, [GLOBAL(pw_9102_13623)] - pmaddwd m4, [GLOBAL(pw_13623_m9102)] - pmaddwd m0, m10, [GLOBAL(pw_9102_13623)] - pmaddwd m10, [GLOBAL(pw_13623_m9102)] - paddd m5, m8 - paddd m4, m8 - paddd m0, m8 - paddd m10, m8 - psrad m5, 14 - psrad m4, 14 - psrad m0, 14 - psrad m10, 14 - packssdw m5, m0 - packssdw m4, m10 - - ; transpose - ; stage 1 - punpcklwd m0, m2, m7 - punpcklwd m10, m1, m4 - punpckhwd m2, m7 - punpckhwd m1, m4 - punpcklwd m7, m6, m5 - punpcklwd m4, m3, m9 - punpckhwd m6, m5 - punpckhwd m3, m9 - - ; stage 2 - punpckldq m5, m0, m10 - punpckldq m9, m2, m1 - punpckhdq m0, m10 - punpckhdq m2, m1 - punpckldq m10, m7, m4 - punpckldq m1, m6, m3 - punpckhdq m7, m4 - punpckhdq m6, m3 - - ; stage 3 - punpcklqdq m4, m5, m10 - punpckhqdq m5, m10 - punpcklqdq m3, m0, m7 - punpckhqdq m0, m7 - punpcklqdq m10, m9, m1 - punpckhqdq m9, m1 - punpcklqdq m7, m2, m6 - punpckhqdq m2, m6 - - psraw m1, m4, 15 - psraw m6, m5, 15 - psraw m8, m3, 15 - psraw m11, m0, 15 - - psubw m4, m1 - psubw m5, m6 - psubw m3, m8 - psubw m0, m11 - - psraw m4, 1 - psraw m5, 1 - psraw m3, 1 - psraw m0, 1 - - psraw m1, m10, 15 - psraw m6, m9, 15 - psraw m8, m7, 15 - psraw m11, m2, 15 - - psubw m10, m1 - psubw m9, m6 - psubw m7, m8 - psubw m2, m11 - - psraw m10, 1 - psraw m9, 1 - psraw m7, 1 - psraw m2, 1 - - STORE_OUTPUT 0, 4 - STORE_OUTPUT 8, 5 - STORE_OUTPUT 16, 3 - STORE_OUTPUT 24, 0 - STORE_OUTPUT 32, 10 - STORE_OUTPUT 40, 9 - STORE_OUTPUT 48, 7 - STORE_OUTPUT 56, 2 - - RET -%endif |