summaryrefslogtreecommitdiffstats
path: root/media/libvpx/vp8/common/x86/copy_sse3.asm
blob: d789a40ccf7c58d294c58a8d901cd7e7c4683dcb (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

%macro STACK_FRAME_CREATE_X3 0
%if ABI_IS_32BIT
  %define     src_ptr       rsi
  %define     src_stride    rax
  %define     ref_ptr       rdi
  %define     ref_stride    rdx
  %define     end_ptr       rcx
  %define     ret_var       rbx
  %define     result_ptr    arg(4)
  %define     max_sad       arg(4)
  %define     height        dword ptr arg(4)
    push        rbp
    mov         rbp,        rsp
    push        rsi
    push        rdi
    push        rbx

    mov         rsi,        arg(0)              ; src_ptr
    mov         rdi,        arg(2)              ; ref_ptr

    movsxd      rax,        dword ptr arg(1)    ; src_stride
    movsxd      rdx,        dword ptr arg(3)    ; ref_stride
%else
  %if LIBVPX_YASM_WIN64
    SAVE_XMM 7, u
    %define     src_ptr     rcx
    %define     src_stride  rdx
    %define     ref_ptr     r8
    %define     ref_stride  r9
    %define     end_ptr     r10
    %define     ret_var     r11
    %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
    %define     max_sad     [rsp+xmm_stack_space+8+4*8]
    %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]
  %else
    %define     src_ptr     rdi
    %define     src_stride  rsi
    %define     ref_ptr     rdx
    %define     ref_stride  rcx
    %define     end_ptr     r9
    %define     ret_var     r10
    %define     result_ptr  r8
    %define     max_sad     r8
    %define     height      r8
  %endif
%endif

%endmacro

%macro STACK_FRAME_DESTROY_X3 0
  %define     src_ptr
  %define     src_stride
  %define     ref_ptr
  %define     ref_stride
  %define     end_ptr
  %define     ret_var
  %define     result_ptr
  %define     max_sad
  %define     height

%if ABI_IS_32BIT
    pop         rbx
    pop         rdi
    pop         rsi
    pop         rbp
%else
  %if LIBVPX_YASM_WIN64
    RESTORE_XMM
  %endif
%endif
    ret
%endmacro


;void vp8_copy32xn_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *dst_ptr,
;    int  dst_stride,
;    int height);
global sym(vp8_copy32xn_sse3) PRIVATE
sym(vp8_copy32xn_sse3):

    STACK_FRAME_CREATE_X3

.block_copy_sse3_loopx4:
        lea             end_ptr,    [src_ptr+src_stride*2]

        movdqu          xmm0,       XMMWORD PTR [src_ptr]
        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
        movdqu          xmm2,       XMMWORD PTR [src_ptr + src_stride]
        movdqu          xmm3,       XMMWORD PTR [src_ptr + src_stride + 16]
        movdqu          xmm4,       XMMWORD PTR [end_ptr]
        movdqu          xmm5,       XMMWORD PTR [end_ptr + 16]
        movdqu          xmm6,       XMMWORD PTR [end_ptr + src_stride]
        movdqu          xmm7,       XMMWORD PTR [end_ptr + src_stride + 16]

        lea             src_ptr,    [src_ptr+src_stride*4]

        lea             end_ptr,    [ref_ptr+ref_stride*2]

        movdqa          XMMWORD PTR [ref_ptr], xmm0
        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
        movdqa          XMMWORD PTR [ref_ptr + ref_stride], xmm2
        movdqa          XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
        movdqa          XMMWORD PTR [end_ptr], xmm4
        movdqa          XMMWORD PTR [end_ptr + 16], xmm5
        movdqa          XMMWORD PTR [end_ptr + ref_stride], xmm6
        movdqa          XMMWORD PTR [end_ptr + ref_stride + 16], xmm7

        lea             ref_ptr,    [ref_ptr+ref_stride*4]

        sub             height,     4
        cmp             height,     4
        jge             .block_copy_sse3_loopx4

        ;Check to see if there is more rows need to be copied.
        cmp             height, 0
        je              .copy_is_done

.block_copy_sse3_loop:
        movdqu          xmm0,       XMMWORD PTR [src_ptr]
        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
        lea             src_ptr,    [src_ptr+src_stride]

        movdqa          XMMWORD PTR [ref_ptr], xmm0
        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
        lea             ref_ptr,    [ref_ptr+ref_stride]

        sub             height,     1
        jne             .block_copy_sse3_loop

.copy_is_done:
    STACK_FRAME_DESTROY_X3