diff options
Diffstat (limited to 'third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm')
-rw-r--r-- | third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm | 1481 |
1 files changed, 0 insertions, 1481 deletions
diff --git a/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm b/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm deleted file mode 100644 index 45bf6ec3c..000000000 --- a/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm +++ /dev/null @@ -1,1481 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA -pw_8: times 8 dw 8 -bilin_filter_m_sse2: times 8 dw 16 - times 8 dw 0 - times 8 dw 14 - times 8 dw 2 - times 8 dw 12 - times 8 dw 4 - times 8 dw 10 - times 8 dw 6 - times 16 dw 8 - times 8 dw 6 - times 8 dw 10 - times 8 dw 4 - times 8 dw 12 - times 8 dw 2 - times 8 dw 14 - -bilin_filter_m_ssse3: times 8 db 16, 0 - times 8 db 14, 2 - times 8 db 12, 4 - times 8 db 10, 6 - times 16 db 8 - times 8 db 6, 10 - times 8 db 4, 12 - times 8 db 2, 14 - -SECTION .text - -; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride, -; int x_offset, int y_offset, -; const uint8_t *dst, ptrdiff_t dst_stride, -; int height, unsigned int *sse); -; -; This function returns the SE and stores SSE in the given pointer. - -%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse - psubw %3, %4 - psubw %1, %2 - paddw %5, %3 - pmaddwd %3, %3 - paddw %5, %1 - pmaddwd %1, %1 - paddd %6, %3 - paddd %6, %1 -%endmacro - -%macro STORE_AND_RET 1 -%if %1 > 4 - ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit - ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg. - ; We have to sign-extend it before adding the words within the register - ; and outputing to a dword. - pcmpgtw m5, m6 ; mask for 0 > x - movhlps m3, m7 - punpcklwd m4, m6, m5 - punpckhwd m6, m5 ; sign-extend m6 word->dword - paddd m7, m3 - paddd m6, m4 - pshufd m3, m7, 0x1 - movhlps m4, m6 - paddd m7, m3 - paddd m6, m4 - mov r1, ssem ; r1 = unsigned int *sse - pshufd m4, m6, 0x1 - movd [r1], m7 ; store sse - paddd m6, m4 - movd raxd, m6 ; store sum as return value -%else ; 4xh - pshuflw m4, m6, 0xe - pshuflw m3, m7, 0xe - paddw m6, m4 - paddd m7, m3 - pcmpgtw m5, m6 ; mask for 0 > x - mov r1, ssem ; r1 = unsigned int *sse - punpcklwd m6, m5 ; sign-extend m6 word->dword - movd [r1], m7 ; store sse - pshuflw m4, m6, 0xe - paddd m6, m4 - movd raxd, m6 ; store sum as return value -%endif - RET -%endmacro - -%macro INC_SRC_BY_SRC_STRIDE 0 -%if ARCH_X86=1 && CONFIG_PIC=1 - add srcq, src_stridemp -%else - add srcq, src_strideq -%endif -%endmacro - -%macro SUBPEL_VARIANCE 1-2 0 ; W -%if cpuflag(ssse3) -%define bilin_filter_m bilin_filter_m_ssse3 -%define filter_idx_shift 4 -%else -%define bilin_filter_m bilin_filter_m_sse2 -%define filter_idx_shift 5 -%endif -; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses -; 11, not 13, if the registers are ordered correctly. May make a minor speed -; difference on Win64 - -%if ARCH_X86_64 - %if %2 == 1 ; avg - cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ - x_offset, y_offset, dst, dst_stride, \ - sec, sec_stride, height, sse - %define sec_str sec_strideq - %else - cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \ - x_offset, y_offset, dst, dst_stride, \ - height, sse - %endif - %define block_height heightd - %define bilin_filter sseq -%else - %if CONFIG_PIC=1 - %if %2 == 1 ; avg - cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, dst, dst_stride, \ - sec, sec_stride, height, sse, \ - g_bilin_filter, g_pw_8 - %define block_height dword heightm - %define sec_str sec_stridemp - - ;Store bilin_filter and pw_8 location in stack - %if GET_GOT_DEFINED == 1 - GET_GOT eax - add esp, 4 ; restore esp - %endif - - lea ecx, [GLOBAL(bilin_filter_m)] - mov g_bilin_filterm, ecx - - lea ecx, [GLOBAL(pw_8)] - mov g_pw_8m, ecx - - LOAD_IF_USED 0, 1 ; load eax, ecx back - %else - cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, dst, dst_stride, \ - height, sse, g_bilin_filter, g_pw_8 - %define block_height heightd - - ;Store bilin_filter and pw_8 location in stack - %if GET_GOT_DEFINED == 1 - GET_GOT eax - add esp, 4 ; restore esp - %endif - - lea ecx, [GLOBAL(bilin_filter_m)] - mov g_bilin_filterm, ecx - - lea ecx, [GLOBAL(pw_8)] - mov g_pw_8m, ecx - - LOAD_IF_USED 0, 1 ; load eax, ecx back - %endif - %else - %if %2 == 1 ; avg - cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, sec, sec_stride, \ - height, sse - %define block_height dword heightm - %define sec_str sec_stridemp - %else - cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, dst, dst_stride, \ - height, sse - %define block_height heightd - %endif - %define bilin_filter bilin_filter_m - %endif -%endif - -%if %1 == 4 - %define movx movd -%else - %define movx movh -%endif - - ASSERT %1 <= 16 ; m6 overflows if w > 16 - pxor m6, m6 ; sum - pxor m7, m7 ; sse - ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we - ; could perhaps use it for something more productive then - pxor m5, m5 ; dedicated zero register -%if %1 < 16 - sar block_height, 1 -%if %2 == 1 ; avg - shl sec_str, 1 -%endif -%endif - - ; FIXME(rbultje) replace by jumptable? - test x_offsetd, x_offsetd - jnz .x_nonzero - ; x_offset == 0 - test y_offsetd, y_offsetd - jnz .x_zero_y_nonzero - - ; x_offset == 0 && y_offset == 0 -.x_zero_y_zero_loop: -%if %1 == 16 - movu m0, [srcq] - mova m1, [dstq] -%if %2 == 1 ; avg - pavgb m0, [secq] - punpckhbw m3, m1, m5 - punpcklbw m1, m5 -%endif - punpckhbw m2, m0, m5 - punpcklbw m0, m5 - -%if %2 == 0 ; !avg - punpckhbw m3, m1, m5 - punpcklbw m1, m5 -%endif - SUM_SSE m0, m1, m2, m3, m6, m7 - - add srcq, src_strideq - add dstq, dst_strideq -%else ; %1 < 16 - movx m0, [srcq] -%if %2 == 1 ; avg -%if %1 > 4 - movhps m0, [srcq+src_strideq] -%else ; 4xh - movx m1, [srcq+src_strideq] - punpckldq m0, m1 -%endif -%else ; !avg - movx m2, [srcq+src_strideq] -%endif - - movx m1, [dstq] - movx m3, [dstq+dst_strideq] - -%if %2 == 1 ; avg -%if %1 > 4 - pavgb m0, [secq] -%else - movh m2, [secq] - pavgb m0, m2 -%endif - punpcklbw m3, m5 - punpcklbw m1, m5 -%if %1 > 4 - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else ; 4xh - punpcklbw m0, m5 - movhlps m2, m0 -%endif -%else ; !avg - punpcklbw m0, m5 - punpcklbw m2, m5 - punpcklbw m3, m5 - punpcklbw m1, m5 -%endif - SUM_SSE m0, m1, m2, m3, m6, m7 - - lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] -%endif -%if %2 == 1 ; avg - add secq, sec_str -%endif - dec block_height - jg .x_zero_y_zero_loop - STORE_AND_RET %1 - -.x_zero_y_nonzero: - cmp y_offsetd, 4 - jne .x_zero_y_nonhalf - - ; x_offset == 0 && y_offset == 0.5 -.x_zero_y_half_loop: -%if %1 == 16 - movu m0, [srcq] - movu m4, [srcq+src_strideq] - mova m1, [dstq] - pavgb m0, m4 - punpckhbw m3, m1, m5 -%if %2 == 1 ; avg - pavgb m0, [secq] -%endif - punpcklbw m1, m5 - punpckhbw m2, m0, m5 - punpcklbw m0, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - - add srcq, src_strideq - add dstq, dst_strideq -%else ; %1 < 16 - movx m0, [srcq] - movx m2, [srcq+src_strideq] -%if %2 == 1 ; avg -%if %1 > 4 - movhps m2, [srcq+src_strideq*2] -%else ; 4xh - movx m1, [srcq+src_strideq*2] - punpckldq m2, m1 -%endif - movx m1, [dstq] -%if %1 > 4 - movlhps m0, m2 -%else ; 4xh - punpckldq m0, m2 -%endif - movx m3, [dstq+dst_strideq] - pavgb m0, m2 - punpcklbw m1, m5 -%if %1 > 4 - pavgb m0, [secq] - punpcklbw m3, m5 - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else ; 4xh - movh m4, [secq] - pavgb m0, m4 - punpcklbw m3, m5 - punpcklbw m0, m5 - movhlps m2, m0 -%endif -%else ; !avg - movx m4, [srcq+src_strideq*2] - movx m1, [dstq] - pavgb m0, m2 - movx m3, [dstq+dst_strideq] - pavgb m2, m4 - punpcklbw m0, m5 - punpcklbw m2, m5 - punpcklbw m3, m5 - punpcklbw m1, m5 -%endif - SUM_SSE m0, m1, m2, m3, m6, m7 - - lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] -%endif -%if %2 == 1 ; avg - add secq, sec_str -%endif - dec block_height - jg .x_zero_y_half_loop - STORE_AND_RET %1 - -.x_zero_y_nonhalf: - ; x_offset == 0 && y_offset == bilin interpolation -%if ARCH_X86_64 - lea bilin_filter, [GLOBAL(bilin_filter_m)] -%endif - shl y_offsetd, filter_idx_shift -%if ARCH_X86_64 && %1 > 4 - mova m8, [bilin_filter+y_offsetq] -%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 - mova m9, [bilin_filter+y_offsetq+16] -%endif - mova m10, [GLOBAL(pw_8)] -%define filter_y_a m8 -%define filter_y_b m9 -%define filter_rnd m10 -%else ; x86-32 or mmx -%if ARCH_X86=1 && CONFIG_PIC=1 -; x_offset == 0, reuse x_offset reg -%define tempq x_offsetq - add y_offsetq, g_bilin_filterm -%define filter_y_a [y_offsetq] -%define filter_y_b [y_offsetq+16] - mov tempq, g_pw_8m -%define filter_rnd [tempq] -%else - add y_offsetq, bilin_filter -%define filter_y_a [y_offsetq] -%define filter_y_b [y_offsetq+16] -%define filter_rnd [GLOBAL(pw_8)] -%endif -%endif - -.x_zero_y_other_loop: -%if %1 == 16 - movu m0, [srcq] - movu m4, [srcq+src_strideq] - mova m1, [dstq] -%if cpuflag(ssse3) - punpckhbw m2, m0, m4 - punpcklbw m0, m4 - pmaddubsw m2, filter_y_a - pmaddubsw m0, filter_y_a - paddw m2, filter_rnd - paddw m0, filter_rnd -%else - punpckhbw m2, m0, m5 - punpckhbw m3, m4, m5 - punpcklbw m0, m5 - punpcklbw m4, m5 - ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can - ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of - ; instructions is the same (5), but it is 1 mul instead of 2, so might be - ; slightly faster because of pmullw latency. It would also cut our rodata - ; tables in half for this function, and save 1-2 registers on x86-64. - pmullw m2, filter_y_a - pmullw m3, filter_y_b - paddw m2, filter_rnd - pmullw m0, filter_y_a - pmullw m4, filter_y_b - paddw m0, filter_rnd - paddw m2, m3 - paddw m0, m4 -%endif - psraw m2, 4 - psraw m0, 4 -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline - packuswb m0, m2 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%endif - punpckhbw m3, m1, m5 - punpcklbw m1, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - - add srcq, src_strideq - add dstq, dst_strideq -%else ; %1 < 16 - movx m0, [srcq] - movx m2, [srcq+src_strideq] - movx m4, [srcq+src_strideq*2] - movx m3, [dstq+dst_strideq] -%if cpuflag(ssse3) - movx m1, [dstq] - punpcklbw m0, m2 - punpcklbw m2, m4 - pmaddubsw m0, filter_y_a - pmaddubsw m2, filter_y_a - punpcklbw m3, m5 - paddw m2, filter_rnd - paddw m0, filter_rnd -%else - punpcklbw m0, m5 - punpcklbw m2, m5 - punpcklbw m4, m5 - pmullw m0, filter_y_a - pmullw m1, m2, filter_y_b - punpcklbw m3, m5 - paddw m0, filter_rnd - pmullw m2, filter_y_a - pmullw m4, filter_y_b - paddw m0, m1 - paddw m2, filter_rnd - movx m1, [dstq] - paddw m2, m4 -%endif - psraw m0, 4 - psraw m2, 4 -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline -%if %1 == 4 - movlhps m0, m2 -%endif - packuswb m0, m2 -%if %1 > 4 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else ; 4xh - movh m2, [secq] - pavgb m0, m2 - punpcklbw m0, m5 - movhlps m2, m0 -%endif -%endif - punpcklbw m1, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - - lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] -%endif -%if %2 == 1 ; avg - add secq, sec_str -%endif - dec block_height - jg .x_zero_y_other_loop -%undef filter_y_a -%undef filter_y_b -%undef filter_rnd - STORE_AND_RET %1 - -.x_nonzero: - cmp x_offsetd, 4 - jne .x_nonhalf - ; x_offset == 0.5 - test y_offsetd, y_offsetd - jnz .x_half_y_nonzero - - ; x_offset == 0.5 && y_offset == 0 -.x_half_y_zero_loop: -%if %1 == 16 - movu m0, [srcq] - movu m4, [srcq+1] - mova m1, [dstq] - pavgb m0, m4 - punpckhbw m3, m1, m5 -%if %2 == 1 ; avg - pavgb m0, [secq] -%endif - punpcklbw m1, m5 - punpckhbw m2, m0, m5 - punpcklbw m0, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - - add srcq, src_strideq - add dstq, dst_strideq -%else ; %1 < 16 - movx m0, [srcq] - movx m4, [srcq+1] -%if %2 == 1 ; avg -%if %1 > 4 - movhps m0, [srcq+src_strideq] - movhps m4, [srcq+src_strideq+1] -%else ; 4xh - movx m1, [srcq+src_strideq] - punpckldq m0, m1 - movx m2, [srcq+src_strideq+1] - punpckldq m4, m2 -%endif - movx m1, [dstq] - movx m3, [dstq+dst_strideq] - pavgb m0, m4 - punpcklbw m3, m5 -%if %1 > 4 - pavgb m0, [secq] - punpcklbw m1, m5 - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else ; 4xh - movh m2, [secq] - pavgb m0, m2 - punpcklbw m1, m5 - punpcklbw m0, m5 - movhlps m2, m0 -%endif -%else ; !avg - movx m2, [srcq+src_strideq] - movx m1, [dstq] - pavgb m0, m4 - movx m4, [srcq+src_strideq+1] - movx m3, [dstq+dst_strideq] - pavgb m2, m4 - punpcklbw m0, m5 - punpcklbw m2, m5 - punpcklbw m3, m5 - punpcklbw m1, m5 -%endif - SUM_SSE m0, m1, m2, m3, m6, m7 - - lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] -%endif -%if %2 == 1 ; avg - add secq, sec_str -%endif - dec block_height - jg .x_half_y_zero_loop - STORE_AND_RET %1 - -.x_half_y_nonzero: - cmp y_offsetd, 4 - jne .x_half_y_nonhalf - - ; x_offset == 0.5 && y_offset == 0.5 -%if %1 == 16 - movu m0, [srcq] - movu m3, [srcq+1] - add srcq, src_strideq - pavgb m0, m3 -.x_half_y_half_loop: - movu m4, [srcq] - movu m3, [srcq+1] - mova m1, [dstq] - pavgb m4, m3 - punpckhbw m3, m1, m5 - pavgb m0, m4 -%if %2 == 1 ; avg - punpcklbw m1, m5 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else - punpckhbw m2, m0, m5 - punpcklbw m0, m5 - punpcklbw m1, m5 -%endif - SUM_SSE m0, m1, m2, m3, m6, m7 - mova m0, m4 - - add srcq, src_strideq - add dstq, dst_strideq -%else ; %1 < 16 - movx m0, [srcq] - movx m3, [srcq+1] - add srcq, src_strideq - pavgb m0, m3 -.x_half_y_half_loop: - movx m2, [srcq] - movx m3, [srcq+1] -%if %2 == 1 ; avg -%if %1 > 4 - movhps m2, [srcq+src_strideq] - movhps m3, [srcq+src_strideq+1] -%else - movx m1, [srcq+src_strideq] - punpckldq m2, m1 - movx m1, [srcq+src_strideq+1] - punpckldq m3, m1 -%endif - pavgb m2, m3 -%if %1 > 4 - movlhps m0, m2 - movhlps m4, m2 -%else ; 4xh - punpckldq m0, m2 - pshuflw m4, m2, 0xe -%endif - movx m1, [dstq] - pavgb m0, m2 - movx m3, [dstq+dst_strideq] -%if %1 > 4 - pavgb m0, [secq] -%else - movh m2, [secq] - pavgb m0, m2 -%endif - punpcklbw m3, m5 - punpcklbw m1, m5 -%if %1 > 4 - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else - punpcklbw m0, m5 - movhlps m2, m0 -%endif -%else ; !avg - movx m4, [srcq+src_strideq] - movx m1, [srcq+src_strideq+1] - pavgb m2, m3 - pavgb m4, m1 - pavgb m0, m2 - pavgb m2, m4 - movx m1, [dstq] - movx m3, [dstq+dst_strideq] - punpcklbw m0, m5 - punpcklbw m2, m5 - punpcklbw m3, m5 - punpcklbw m1, m5 -%endif - SUM_SSE m0, m1, m2, m3, m6, m7 - mova m0, m4 - - lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] -%endif -%if %2 == 1 ; avg - add secq, sec_str -%endif - dec block_height - jg .x_half_y_half_loop - STORE_AND_RET %1 - -.x_half_y_nonhalf: - ; x_offset == 0.5 && y_offset == bilin interpolation -%if ARCH_X86_64 - lea bilin_filter, [GLOBAL(bilin_filter_m)] -%endif - shl y_offsetd, filter_idx_shift -%if ARCH_X86_64 && %1 > 4 - mova m8, [bilin_filter+y_offsetq] -%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 - mova m9, [bilin_filter+y_offsetq+16] -%endif - mova m10, [GLOBAL(pw_8)] -%define filter_y_a m8 -%define filter_y_b m9 -%define filter_rnd m10 -%else ;x86_32 -%if ARCH_X86=1 && CONFIG_PIC=1 -; x_offset == 0.5. We can reuse x_offset reg -%define tempq x_offsetq - add y_offsetq, g_bilin_filterm -%define filter_y_a [y_offsetq] -%define filter_y_b [y_offsetq+16] - mov tempq, g_pw_8m -%define filter_rnd [tempq] -%else - add y_offsetq, bilin_filter -%define filter_y_a [y_offsetq] -%define filter_y_b [y_offsetq+16] -%define filter_rnd [GLOBAL(pw_8)] -%endif -%endif - -%if %1 == 16 - movu m0, [srcq] - movu m3, [srcq+1] - add srcq, src_strideq - pavgb m0, m3 -.x_half_y_other_loop: - movu m4, [srcq] - movu m2, [srcq+1] - mova m1, [dstq] - pavgb m4, m2 -%if cpuflag(ssse3) - punpckhbw m2, m0, m4 - punpcklbw m0, m4 - pmaddubsw m2, filter_y_a - pmaddubsw m0, filter_y_a - paddw m2, filter_rnd - paddw m0, filter_rnd - psraw m2, 4 -%else - punpckhbw m2, m0, m5 - punpckhbw m3, m4, m5 - pmullw m2, filter_y_a - pmullw m3, filter_y_b - paddw m2, filter_rnd - punpcklbw m0, m5 - paddw m2, m3 - punpcklbw m3, m4, m5 - pmullw m0, filter_y_a - pmullw m3, filter_y_b - paddw m0, filter_rnd - psraw m2, 4 - paddw m0, m3 -%endif - punpckhbw m3, m1, m5 - psraw m0, 4 -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline - packuswb m0, m2 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%endif - punpcklbw m1, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - mova m0, m4 - - add srcq, src_strideq - add dstq, dst_strideq -%else ; %1 < 16 - movx m0, [srcq] - movx m3, [srcq+1] - add srcq, src_strideq - pavgb m0, m3 -%if notcpuflag(ssse3) - punpcklbw m0, m5 -%endif -.x_half_y_other_loop: - movx m2, [srcq] - movx m1, [srcq+1] - movx m4, [srcq+src_strideq] - movx m3, [srcq+src_strideq+1] - pavgb m2, m1 - pavgb m4, m3 - movx m3, [dstq+dst_strideq] -%if cpuflag(ssse3) - movx m1, [dstq] - punpcklbw m0, m2 - punpcklbw m2, m4 - pmaddubsw m0, filter_y_a - pmaddubsw m2, filter_y_a - punpcklbw m3, m5 - paddw m0, filter_rnd - paddw m2, filter_rnd -%else - punpcklbw m2, m5 - punpcklbw m4, m5 - pmullw m0, filter_y_a - pmullw m1, m2, filter_y_b - punpcklbw m3, m5 - paddw m0, filter_rnd - pmullw m2, filter_y_a - paddw m0, m1 - pmullw m1, m4, filter_y_b - paddw m2, filter_rnd - paddw m2, m1 - movx m1, [dstq] -%endif - psraw m0, 4 - psraw m2, 4 -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline -%if %1 == 4 - movlhps m0, m2 -%endif - packuswb m0, m2 -%if %1 > 4 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else - movh m2, [secq] - pavgb m0, m2 - punpcklbw m0, m5 - movhlps m2, m0 -%endif -%endif - punpcklbw m1, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - mova m0, m4 - - lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] -%endif -%if %2 == 1 ; avg - add secq, sec_str -%endif - dec block_height - jg .x_half_y_other_loop -%undef filter_y_a -%undef filter_y_b -%undef filter_rnd - STORE_AND_RET %1 - -.x_nonhalf: - test y_offsetd, y_offsetd - jnz .x_nonhalf_y_nonzero - - ; x_offset == bilin interpolation && y_offset == 0 -%if ARCH_X86_64 - lea bilin_filter, [GLOBAL(bilin_filter_m)] -%endif - shl x_offsetd, filter_idx_shift -%if ARCH_X86_64 && %1 > 4 - mova m8, [bilin_filter+x_offsetq] -%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 - mova m9, [bilin_filter+x_offsetq+16] -%endif - mova m10, [GLOBAL(pw_8)] -%define filter_x_a m8 -%define filter_x_b m9 -%define filter_rnd m10 -%else ; x86-32 -%if ARCH_X86=1 && CONFIG_PIC=1 -;y_offset == 0. We can reuse y_offset reg. -%define tempq y_offsetq - add x_offsetq, g_bilin_filterm -%define filter_x_a [x_offsetq] -%define filter_x_b [x_offsetq+16] - mov tempq, g_pw_8m -%define filter_rnd [tempq] -%else - add x_offsetq, bilin_filter -%define filter_x_a [x_offsetq] -%define filter_x_b [x_offsetq+16] -%define filter_rnd [GLOBAL(pw_8)] -%endif -%endif - -.x_other_y_zero_loop: -%if %1 == 16 - movu m0, [srcq] - movu m4, [srcq+1] - mova m1, [dstq] -%if cpuflag(ssse3) - punpckhbw m2, m0, m4 - punpcklbw m0, m4 - pmaddubsw m2, filter_x_a - pmaddubsw m0, filter_x_a - paddw m2, filter_rnd - paddw m0, filter_rnd -%else - punpckhbw m2, m0, m5 - punpckhbw m3, m4, m5 - punpcklbw m0, m5 - punpcklbw m4, m5 - pmullw m2, filter_x_a - pmullw m3, filter_x_b - paddw m2, filter_rnd - pmullw m0, filter_x_a - pmullw m4, filter_x_b - paddw m0, filter_rnd - paddw m2, m3 - paddw m0, m4 -%endif - psraw m2, 4 - psraw m0, 4 -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline - packuswb m0, m2 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%endif - punpckhbw m3, m1, m5 - punpcklbw m1, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - - add srcq, src_strideq - add dstq, dst_strideq -%else ; %1 < 16 - movx m0, [srcq] - movx m1, [srcq+1] - movx m2, [srcq+src_strideq] - movx m4, [srcq+src_strideq+1] - movx m3, [dstq+dst_strideq] -%if cpuflag(ssse3) - punpcklbw m0, m1 - movx m1, [dstq] - punpcklbw m2, m4 - pmaddubsw m0, filter_x_a - pmaddubsw m2, filter_x_a - punpcklbw m3, m5 - paddw m0, filter_rnd - paddw m2, filter_rnd -%else - punpcklbw m0, m5 - punpcklbw m1, m5 - punpcklbw m2, m5 - punpcklbw m4, m5 - pmullw m0, filter_x_a - pmullw m1, filter_x_b - punpcklbw m3, m5 - paddw m0, filter_rnd - pmullw m2, filter_x_a - pmullw m4, filter_x_b - paddw m0, m1 - paddw m2, filter_rnd - movx m1, [dstq] - paddw m2, m4 -%endif - psraw m0, 4 - psraw m2, 4 -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline -%if %1 == 4 - movlhps m0, m2 -%endif - packuswb m0, m2 -%if %1 > 4 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else - movh m2, [secq] - pavgb m0, m2 - punpcklbw m0, m5 - movhlps m2, m0 -%endif -%endif - punpcklbw m1, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - - lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] -%endif -%if %2 == 1 ; avg - add secq, sec_str -%endif - dec block_height - jg .x_other_y_zero_loop -%undef filter_x_a -%undef filter_x_b -%undef filter_rnd - STORE_AND_RET %1 - -.x_nonhalf_y_nonzero: - cmp y_offsetd, 4 - jne .x_nonhalf_y_nonhalf - - ; x_offset == bilin interpolation && y_offset == 0.5 -%if ARCH_X86_64 - lea bilin_filter, [GLOBAL(bilin_filter_m)] -%endif - shl x_offsetd, filter_idx_shift -%if ARCH_X86_64 && %1 > 4 - mova m8, [bilin_filter+x_offsetq] -%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 - mova m9, [bilin_filter+x_offsetq+16] -%endif - mova m10, [GLOBAL(pw_8)] -%define filter_x_a m8 -%define filter_x_b m9 -%define filter_rnd m10 -%else ; x86-32 -%if ARCH_X86=1 && CONFIG_PIC=1 -; y_offset == 0.5. We can reuse y_offset reg. -%define tempq y_offsetq - add x_offsetq, g_bilin_filterm -%define filter_x_a [x_offsetq] -%define filter_x_b [x_offsetq+16] - mov tempq, g_pw_8m -%define filter_rnd [tempq] -%else - add x_offsetq, bilin_filter -%define filter_x_a [x_offsetq] -%define filter_x_b [x_offsetq+16] -%define filter_rnd [GLOBAL(pw_8)] -%endif -%endif - -%if %1 == 16 - movu m0, [srcq] - movu m1, [srcq+1] -%if cpuflag(ssse3) - punpckhbw m2, m0, m1 - punpcklbw m0, m1 - pmaddubsw m2, filter_x_a - pmaddubsw m0, filter_x_a - paddw m2, filter_rnd - paddw m0, filter_rnd -%else - punpckhbw m2, m0, m5 - punpckhbw m3, m1, m5 - punpcklbw m0, m5 - punpcklbw m1, m5 - pmullw m0, filter_x_a - pmullw m1, filter_x_b - paddw m0, filter_rnd - pmullw m2, filter_x_a - pmullw m3, filter_x_b - paddw m2, filter_rnd - paddw m0, m1 - paddw m2, m3 -%endif - psraw m0, 4 - psraw m2, 4 - add srcq, src_strideq - packuswb m0, m2 -.x_other_y_half_loop: - movu m4, [srcq] - movu m3, [srcq+1] -%if cpuflag(ssse3) - mova m1, [dstq] - punpckhbw m2, m4, m3 - punpcklbw m4, m3 - pmaddubsw m2, filter_x_a - pmaddubsw m4, filter_x_a - paddw m2, filter_rnd - paddw m4, filter_rnd - psraw m2, 4 - psraw m4, 4 - packuswb m4, m2 - pavgb m0, m4 - punpckhbw m3, m1, m5 - punpcklbw m1, m5 -%else - punpckhbw m2, m4, m5 - punpckhbw m1, m3, m5 - punpcklbw m4, m5 - punpcklbw m3, m5 - pmullw m4, filter_x_a - pmullw m3, filter_x_b - paddw m4, filter_rnd - pmullw m2, filter_x_a - pmullw m1, filter_x_b - paddw m2, filter_rnd - paddw m4, m3 - paddw m2, m1 - mova m1, [dstq] - psraw m4, 4 - psraw m2, 4 - punpckhbw m3, m1, m5 - ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we - ; have a 1-register shortage to be able to store the backup of the bilin - ; filtered second line as words as cache for the next line. Packing into - ; a byte costs 1 pack and 2 unpacks, but saves a register. - packuswb m4, m2 - punpcklbw m1, m5 - pavgb m0, m4 -%endif -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline - pavgb m0, [secq] -%endif - punpckhbw m2, m0, m5 - punpcklbw m0, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - mova m0, m4 - - add srcq, src_strideq - add dstq, dst_strideq -%else ; %1 < 16 - movx m0, [srcq] - movx m1, [srcq+1] -%if cpuflag(ssse3) - punpcklbw m0, m1 - pmaddubsw m0, filter_x_a - paddw m0, filter_rnd -%else - punpcklbw m0, m5 - punpcklbw m1, m5 - pmullw m0, filter_x_a - pmullw m1, filter_x_b - paddw m0, filter_rnd - paddw m0, m1 -%endif - add srcq, src_strideq - psraw m0, 4 -.x_other_y_half_loop: - movx m2, [srcq] - movx m1, [srcq+1] - movx m4, [srcq+src_strideq] - movx m3, [srcq+src_strideq+1] -%if cpuflag(ssse3) - punpcklbw m2, m1 - punpcklbw m4, m3 - pmaddubsw m2, filter_x_a - pmaddubsw m4, filter_x_a - movx m1, [dstq] - movx m3, [dstq+dst_strideq] - paddw m2, filter_rnd - paddw m4, filter_rnd -%else - punpcklbw m2, m5 - punpcklbw m1, m5 - punpcklbw m4, m5 - punpcklbw m3, m5 - pmullw m2, filter_x_a - pmullw m1, filter_x_b - paddw m2, filter_rnd - pmullw m4, filter_x_a - pmullw m3, filter_x_b - paddw m4, filter_rnd - paddw m2, m1 - movx m1, [dstq] - paddw m4, m3 - movx m3, [dstq+dst_strideq] -%endif - psraw m2, 4 - psraw m4, 4 - pavgw m0, m2 - pavgw m2, m4 -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline - also consider going to bytes here -%if %1 == 4 - movlhps m0, m2 -%endif - packuswb m0, m2 -%if %1 > 4 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else - movh m2, [secq] - pavgb m0, m2 - punpcklbw m0, m5 - movhlps m2, m0 -%endif -%endif - punpcklbw m3, m5 - punpcklbw m1, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - mova m0, m4 - - lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] -%endif -%if %2 == 1 ; avg - add secq, sec_str -%endif - dec block_height - jg .x_other_y_half_loop -%undef filter_x_a -%undef filter_x_b -%undef filter_rnd - STORE_AND_RET %1 - -.x_nonhalf_y_nonhalf: -%if ARCH_X86_64 - lea bilin_filter, [GLOBAL(bilin_filter_m)] -%endif - shl x_offsetd, filter_idx_shift - shl y_offsetd, filter_idx_shift -%if ARCH_X86_64 && %1 > 4 - mova m8, [bilin_filter+x_offsetq] -%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 - mova m9, [bilin_filter+x_offsetq+16] -%endif - mova m10, [bilin_filter+y_offsetq] -%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 - mova m11, [bilin_filter+y_offsetq+16] -%endif - mova m12, [GLOBAL(pw_8)] -%define filter_x_a m8 -%define filter_x_b m9 -%define filter_y_a m10 -%define filter_y_b m11 -%define filter_rnd m12 -%else ; x86-32 -%if ARCH_X86=1 && CONFIG_PIC=1 -; In this case, there is NO unused register. Used src_stride register. Later, -; src_stride has to be loaded from stack when it is needed. -%define tempq src_strideq - mov tempq, g_bilin_filterm - add x_offsetq, tempq - add y_offsetq, tempq -%define filter_x_a [x_offsetq] -%define filter_x_b [x_offsetq+16] -%define filter_y_a [y_offsetq] -%define filter_y_b [y_offsetq+16] - - mov tempq, g_pw_8m -%define filter_rnd [tempq] -%else - add x_offsetq, bilin_filter - add y_offsetq, bilin_filter -%define filter_x_a [x_offsetq] -%define filter_x_b [x_offsetq+16] -%define filter_y_a [y_offsetq] -%define filter_y_b [y_offsetq+16] -%define filter_rnd [GLOBAL(pw_8)] -%endif -%endif - - ; x_offset == bilin interpolation && y_offset == bilin interpolation -%if %1 == 16 - movu m0, [srcq] - movu m1, [srcq+1] -%if cpuflag(ssse3) - punpckhbw m2, m0, m1 - punpcklbw m0, m1 - pmaddubsw m2, filter_x_a - pmaddubsw m0, filter_x_a - paddw m2, filter_rnd - paddw m0, filter_rnd -%else - punpckhbw m2, m0, m5 - punpckhbw m3, m1, m5 - punpcklbw m0, m5 - punpcklbw m1, m5 - pmullw m0, filter_x_a - pmullw m1, filter_x_b - paddw m0, filter_rnd - pmullw m2, filter_x_a - pmullw m3, filter_x_b - paddw m2, filter_rnd - paddw m0, m1 - paddw m2, m3 -%endif - psraw m0, 4 - psraw m2, 4 - - INC_SRC_BY_SRC_STRIDE - - packuswb m0, m2 -.x_other_y_other_loop: -%if cpuflag(ssse3) - movu m4, [srcq] - movu m3, [srcq+1] - mova m1, [dstq] - punpckhbw m2, m4, m3 - punpcklbw m4, m3 - pmaddubsw m2, filter_x_a - pmaddubsw m4, filter_x_a - punpckhbw m3, m1, m5 - paddw m2, filter_rnd - paddw m4, filter_rnd - psraw m2, 4 - psraw m4, 4 - packuswb m4, m2 - punpckhbw m2, m0, m4 - punpcklbw m0, m4 - pmaddubsw m2, filter_y_a - pmaddubsw m0, filter_y_a - punpcklbw m1, m5 - paddw m2, filter_rnd - paddw m0, filter_rnd - psraw m2, 4 - psraw m0, 4 -%else - movu m3, [srcq] - movu m4, [srcq+1] - punpckhbw m1, m3, m5 - punpckhbw m2, m4, m5 - punpcklbw m3, m5 - punpcklbw m4, m5 - pmullw m3, filter_x_a - pmullw m4, filter_x_b - paddw m3, filter_rnd - pmullw m1, filter_x_a - pmullw m2, filter_x_b - paddw m1, filter_rnd - paddw m3, m4 - paddw m1, m2 - psraw m3, 4 - psraw m1, 4 - packuswb m4, m3, m1 - punpckhbw m2, m0, m5 - punpcklbw m0, m5 - pmullw m2, filter_y_a - pmullw m1, filter_y_b - paddw m2, filter_rnd - pmullw m0, filter_y_a - pmullw m3, filter_y_b - paddw m2, m1 - mova m1, [dstq] - paddw m0, filter_rnd - psraw m2, 4 - paddw m0, m3 - punpckhbw m3, m1, m5 - psraw m0, 4 - punpcklbw m1, m5 -%endif -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline - packuswb m0, m2 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%endif - SUM_SSE m0, m1, m2, m3, m6, m7 - mova m0, m4 - - INC_SRC_BY_SRC_STRIDE - add dstq, dst_strideq -%else ; %1 < 16 - movx m0, [srcq] - movx m1, [srcq+1] -%if cpuflag(ssse3) - punpcklbw m0, m1 - pmaddubsw m0, filter_x_a - paddw m0, filter_rnd -%else - punpcklbw m0, m5 - punpcklbw m1, m5 - pmullw m0, filter_x_a - pmullw m1, filter_x_b - paddw m0, filter_rnd - paddw m0, m1 -%endif - psraw m0, 4 -%if cpuflag(ssse3) - packuswb m0, m0 -%endif - - INC_SRC_BY_SRC_STRIDE - -.x_other_y_other_loop: - movx m2, [srcq] - movx m1, [srcq+1] - - INC_SRC_BY_SRC_STRIDE - movx m4, [srcq] - movx m3, [srcq+1] - -%if cpuflag(ssse3) - punpcklbw m2, m1 - punpcklbw m4, m3 - pmaddubsw m2, filter_x_a - pmaddubsw m4, filter_x_a - movx m3, [dstq+dst_strideq] - movx m1, [dstq] - paddw m2, filter_rnd - paddw m4, filter_rnd - psraw m2, 4 - psraw m4, 4 - packuswb m2, m2 - packuswb m4, m4 - punpcklbw m0, m2 - punpcklbw m2, m4 - pmaddubsw m0, filter_y_a - pmaddubsw m2, filter_y_a - punpcklbw m3, m5 - paddw m0, filter_rnd - paddw m2, filter_rnd - psraw m0, 4 - psraw m2, 4 - punpcklbw m1, m5 -%else - punpcklbw m2, m5 - punpcklbw m1, m5 - punpcklbw m4, m5 - punpcklbw m3, m5 - pmullw m2, filter_x_a - pmullw m1, filter_x_b - paddw m2, filter_rnd - pmullw m4, filter_x_a - pmullw m3, filter_x_b - paddw m4, filter_rnd - paddw m2, m1 - paddw m4, m3 - psraw m2, 4 - psraw m4, 4 - pmullw m0, filter_y_a - pmullw m3, m2, filter_y_b - paddw m0, filter_rnd - pmullw m2, filter_y_a - pmullw m1, m4, filter_y_b - paddw m2, filter_rnd - paddw m0, m3 - movx m3, [dstq+dst_strideq] - paddw m2, m1 - movx m1, [dstq] - psraw m0, 4 - psraw m2, 4 - punpcklbw m3, m5 - punpcklbw m1, m5 -%endif -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline -%if %1 == 4 - movlhps m0, m2 -%endif - packuswb m0, m2 -%if %1 > 4 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else - movh m2, [secq] - pavgb m0, m2 - punpcklbw m0, m5 - movhlps m2, m0 -%endif -%endif - SUM_SSE m0, m1, m2, m3, m6, m7 - mova m0, m4 - - INC_SRC_BY_SRC_STRIDE - lea dstq, [dstq+dst_strideq*2] -%endif -%if %2 == 1 ; avg - add secq, sec_str -%endif - dec block_height - jg .x_other_y_other_loop -%undef filter_x_a -%undef filter_x_b -%undef filter_y_a -%undef filter_y_b -%undef filter_rnd -%undef movx - STORE_AND_RET %1 -%endmacro - -; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical -; between the ssse3 and non-ssse3 version. It may make sense to merge their -; code in the sense that the ssse3 version would jump to the appropriate -; location in the sse/2 version, rather than duplicating that code in the -; binary. - -INIT_XMM sse2 -SUBPEL_VARIANCE 4 -SUBPEL_VARIANCE 8 -SUBPEL_VARIANCE 16 - -INIT_XMM ssse3 -SUBPEL_VARIANCE 4 -SUBPEL_VARIANCE 8 -SUBPEL_VARIANCE 16 - -INIT_XMM sse2 -SUBPEL_VARIANCE 4, 1 -SUBPEL_VARIANCE 8, 1 -SUBPEL_VARIANCE 16, 1 - -INIT_XMM ssse3 -SUBPEL_VARIANCE 4, 1 -SUBPEL_VARIANCE 8, 1 -SUBPEL_VARIANCE 16, 1 |