; path: root/media/libjpeg/simd/jquanti-sse2.asm
; blob: aea8604e2200ab70fe325f4d214e4fd67169b1d1 (plain)
;
; jquanti-sse2.asm - sample data conversion and quantization (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler); it
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
;                      DCTELEM *workspace);
;

%define sample_data     ebp+8           ; JSAMPARRAY sample_data
%define start_col       ebp+12          ; JDIMENSION start_col
%define workspace       ebp+16          ; DCTELEM *workspace

; Stack-argument routine with an ebp frame (arguments read at ebp+8..ebp+16).
;
; Register roles:
;   esi      cursor into the sample_data array of row pointers
;   eax      start_col, the element offset applied to every row
;   edi      output cursor into workspace
;   ecx      loop passes remaining (four rows are handled per pass)
;   ebx/edx  row pointers of the two rows currently being fetched
;   xmm6     all zero bits (high half for the byte->word widening)
;   xmm7     0xFF80 in every word, i.e. -128 as a signed 16-bit value
;
; ebx, esi and edi are saved and restored; ecx, edx, eax and
; xmm0-xmm3/xmm6/xmm7 are overwritten.
; The stores use movdqa, so workspace is assumed to be 16-byte aligned
; (NOTE(review): guaranteed by the caller, not checked here).

        align   16
        global  EXTN(jsimd_convsamp_sse2)

EXTN(jsimd_convsamp_sse2):
        push    ebp
        mov     ebp, esp
        push    ebx
        push    esi
        push    edi
        ; ecx and edx are used as scratch below and deliberately not saved.

        ; Fetch the three arguments and set up the pass counter.
        mov     edi, POINTER [workspace]        ; edi = (DCTELEM *) output
        mov     eax, JDIMENSION [start_col]     ; eax = column offset
        mov     esi, JSAMPARRAY [sample_data]   ; esi = (JSAMPROW *) rows
        mov     ecx, DCTSIZE/4                  ; 4 rows consumed per pass

        ; Build the two SIMD constants.
        pcmpeqw xmm7, xmm7                      ; every word = 0xFFFF
        psllw   xmm7, 7                         ; every word = 0xFF80
        pxor    xmm6, xmm6                      ; xmm6 = 0
        alignx  16,7
.rowloop:
        ; --- fetch 8 bytes from each of rows 0 and 1 ---------------------
        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]
        mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]
        movq    xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]       ; row 0
        movq    xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]       ; row 1

        ; --- fetch 8 bytes from each of rows 2 and 3 (ebx/edx reused) ----
        mov     ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]
        mov     edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]
        movq    xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]       ; row 2
        movq    xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]       ; row 3

        ; --- widen: interleave with zero so each byte becomes a word -----
        punpcklbw xmm0, xmm6
        punpcklbw xmm1, xmm6
        punpcklbw xmm2, xmm6
        punpcklbw xmm3, xmm6

        ; --- recentre: add 0xFF80 (-128) to every word -------------------
        paddw   xmm0, xmm7
        paddw   xmm1, xmm7
        paddw   xmm2, xmm7
        paddw   xmm3, xmm7

        ; --- write the four converted rows to the workspace --------------
        movdqa  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
        movdqa  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
        movdqa  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
        movdqa  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3

        ; --- step both cursors past the four rows just handled -----------
        add     edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
        add     esi, byte 4*SIZEOF_JSAMPROW
        dec     ecx                             ; sets ZF for the branch
        jnz     short .rowloop

        ; Restore in reverse order of the pushes.
        pop     edi
        pop     esi
        pop     ebx
        pop     ebp
        ret

; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; This implementation is based on an algorithm described in
;   "How to optimize for the Pentium family of microprocessors"
;   (http://www.agner.org/assem/).
;
; GLOBAL(void)
; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM *divisors,
;                      DCTELEM *workspace);
;

; The divisors argument is addressed as three sub-tables placed DCTSIZE
; XMMBLOCK rows apart: reciprocals first, then corrections, then scales.
%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
%define SCALE(m,n,b)      XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)

%define coef_block      ebp+8           ; JCOEFPTR coef_block
%define divisors        ebp+12          ; DCTELEM *divisors
%define workspace       ebp+16          ; DCTELEM *workspace

; Stack-argument routine with an ebp frame (arguments read at ebp+8..ebp+16).
;
; Register roles:
;   esi        input cursor into workspace
;   edx        cursor into the divisors tables (all three advance together)
;   edi        output cursor into coef_block
;   eax        loop passes remaining (four XMMBLOCK rows per pass)
;   xmm0-xmm3  the values being quantized
;   xmm4-xmm7  per-word sign masks of the matching xmm0-xmm3 inputs
;
; esi and edi are saved and restored; eax, edx and xmm0-xmm7 are
; overwritten; ebx and ecx are not touched.
; Every memory access is movdqa or an aligned SSE memory operand, so all
; three buffers are assumed 16-byte aligned (NOTE(review): caller's duty).

        align   16
        global  EXTN(jsimd_quantize_sse2)

EXTN(jsimd_quantize_sse2):
        push    ebp
        mov     ebp, esp
        push    esi
        push    edi
        ; edx is scratch and not saved; ebx/ecx are never used here.

        mov     edi, JCOEFPTR [coef_block]      ; edi = output
        mov     edx, POINTER [divisors]         ; edx = divisor tables
        mov     esi, POINTER [workspace]        ; esi = input
        mov     eax, DCTSIZE2/32                ; 32 elements per pass
        alignx  16,7
.blockloop:
        ; --- load four rows and keep a second copy for the sign mask -----
        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
        movdqa  xmm1, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
        movdqa  xmm2, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
        movdqa  xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
        movdqa  xmm4, xmm0
        movdqa  xmm5, xmm1
        movdqa  xmm6, xmm2
        movdqa  xmm7, xmm3

        ; --- sign mask s = x >> (WORD_BIT-1), arithmetic shift -----------
        psraw   xmm4, (WORD_BIT-1)
        psraw   xmm5, (WORD_BIT-1)
        psraw   xmm6, (WORD_BIT-1)
        psraw   xmm7, (WORD_BIT-1)

        ; --- absolute value: |x| = (x ^ s) - s ---------------------------
        pxor    xmm0, xmm4
        pxor    xmm1, xmm5
        pxor    xmm2, xmm6
        pxor    xmm3, xmm7
        psubw   xmm0, xmm4
        psubw   xmm1, xmm5
        psubw   xmm2, xmm6
        psubw   xmm3, xmm7

        ; --- add the correction term (correction + roundfactor) ----------
        paddw   xmm0, XMMWORD [CORRECTION(0,0,edx)]
        paddw   xmm1, XMMWORD [CORRECTION(1,0,edx)]
        paddw   xmm2, XMMWORD [CORRECTION(2,0,edx)]
        paddw   xmm3, XMMWORD [CORRECTION(3,0,edx)]

        ; --- multiply by the reciprocal, keeping the high unsigned word --
        pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,edx)]
        pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
        pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
        pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,edx)]

        ; --- multiply by the scale, again keeping the high unsigned word -
        pmulhuw xmm0, XMMWORD [SCALE(0,0,edx)]
        pmulhuw xmm1, XMMWORD [SCALE(1,0,edx)]
        pmulhuw xmm2, XMMWORD [SCALE(2,0,edx)]
        pmulhuw xmm3, XMMWORD [SCALE(3,0,edx)]

        ; --- put the original sign back: (q ^ s) - s ---------------------
        pxor    xmm0, xmm4
        pxor    xmm1, xmm5
        pxor    xmm2, xmm6
        pxor    xmm3, xmm7
        psubw   xmm0, xmm4
        psubw   xmm1, xmm5
        psubw   xmm2, xmm6
        psubw   xmm3, xmm7

        ; --- store the four quantized rows -------------------------------
        movdqa  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
        movdqa  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
        movdqa  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
        movdqa  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3

        ; --- advance input, divisor and output cursors by 32 elements ----
        add     edi, byte 32*SIZEOF_JCOEF
        add     edx, byte 32*SIZEOF_DCTELEM
        add     esi, byte 32*SIZEOF_DCTELEM
        dec     eax                             ; sets ZF for the branch
        jnz     near .blockloop                 ; near: explicit long-form jump, as in the original

        ; Restore in reverse order of the pushes.
        pop     edi
        pop     esi
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16