# Source path: security/nss/lib/freebl/arcfour-amd64-gas.s
# (blob 7c4f5358f1add1c369da40be33d502637c12ed03)
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# ** ARCFOUR implementation optimized for AMD64.
# **
# ** The throughput achieved by this code is about 320 MBytes/sec, on
# ** a 1.8 GHz AMD Opteron (rev C0) processor.

# ----------------------------------------------------------------------
# ARCFOUR -- XOR a buffer with the RC4 key stream, updating the key state.
#
# Syntax: GNU as, AT&T operand order (source first, destination last).
#
# Arguments are taken from the first four System V AMD64 integer
# argument registers:
#   %rdi = key   state block; as accessed below: x at offset 0, y at
#                offset 8, table d[] at offset 16 with one 8-byte slot
#                per entry
#   %rsi = len   number of bytes to process
#   %rdx = in    input buffer
#   %rcx = out   output buffer
#
# Register roles once the setup code has run:
#   %rbp = d (key + 16)
#   %rcx = x, held one step ahead (already incremented for the next byte)
#   %rdx = y
#   %rax = tx = d[x], preloaded for the next byte
#   %rsi = input cursor, %rdi = output cursor
#   %r9  = in+len-8 while looping over 8-byte blocks, in+len for the tail
#   %rbx = scratch (ty, then the index ty+tx)
#   %r8  = key-stream accumulator
#   %r11 = byte counter inside one 8-byte block
#
# Effects: writes len bytes to out, swaps entries of d[] in place, and
# stores the low byte of x and of y back into the key block.
# No return value is set up; %rax holds a table entry on exit.
#
# Clobbers: %rax, %rcx, %rdx, %rsi, %rdi, %r8, %r9, %r11, flags.
# %rbx and %rbp are saved on entry and restored before returning.
#
# NOTE(review): y is loaded as a full 64-bit value and used as a table
# index without masking, and table entries are used as indices after a
# 32-bit load.  This presumably relies on the upper bytes of key->y and
# of every table slot being zero -- confirm against the code that
# initialises the key.
# ----------------------------------------------------------------------
.text
.align 16
.globl ARCFOUR
.type ARCFOUR,@function
ARCFOUR:
	# Save the two callee-saved registers used below.
	pushq	%rbp
	pushq	%rbx
	# Move the arguments out of the way so %rcx/%rdx can hold x and y.
	movq	%rdi,		%rbp	# key = ARG(key)
	movq	%rsi,		%rbx	# rbx = ARG(len)
	movq	%rdx,		%rsi	# in = ARG(in)
	movq	%rcx,		%rdi	# out = ARG(out)
	movq	(%rbp),		%rcx	# x = key->x
	movq	8(%rbp),	%rdx	# y = key->y
	addq	$16,		%rbp	# d = key->data
	# Pre-increment x (mod 256) and preload tx = d[x]; both loops below
	# keep this "one step ahead" invariant, undone at .Lfinished.
	incq	%rcx			# x++
	andq	$255,		%rcx	# x &= 0xff
	leaq	-8(%rbx,%rsi),	%rbx	# rbx = in+len-8
	movq	%rbx,		%r9	# tmp = in+len-8
	movq	0(%rbp,%rcx,8),	%rax	# tx = d[x]
	# Skip the block loop when fewer than 8 bytes were requested.
	# The comparison is signed (jl).
	cmpq	%rsi,		%rbx	# cmp in with in+len-8
	jl	.Lend			# jump if (in+len-8 < in)

	# Block loop: each pass consumes 8 input bytes.  The cursors are
	# advanced first, so the data for this pass is at -8(%rsi)/-8(%rdi).
.Lstart:
	addq	$8,		%rsi		# increment in
	addq	$8,		%rdi		# increment out

	# generate the next 8 bytes of the rc4 stream into %r8
	# Each round places one key-stream byte in %r8b and rotates %r8
	# right by 8 bits; after 8 rounds the first byte generated sits in
	# the lowest byte of %r8 and the last one in the highest byte.
	movq	$8,		%r11		# byte counter
	# The 32-bit loads into %ebx/%eax clear the upper halves of
	# %rbx/%rax, so they can be used as 64-bit index registers.
1:	addb	%al,		%dl		# y += tx
	movl	0(%rbp,%rdx,8),	%ebx		# ty = d[y]
	movl	%ebx,		0(%rbp,%rcx,8)	# d[x] = ty
	addb	%al,		%bl		# val = ty + tx
	movl	%eax,		0(%rbp,%rdx,8)	# d[y] = tx
	incb	%cl				# x++		(NEXT ROUND)
	movl	0(%rbp,%rcx,8),	%eax		# tx = d[x]	(NEXT ROUND)
	movb	0(%rbp,%rbx,8),	%r8b		# val = d[val]
	# jnz below tests the ZF produced by decb; the rorq in between
	# leaves ZF untouched.
	decb	%r11b
	rorq	$8,		%r8		# (ror does not change ZF)
	jnz 	1b

	# xor 8 bytes
	# The store sits between cmpq and jle; mov does not modify flags,
	# so jle still sees the result of cmpq.
	xorq	-8(%rsi),	%r8
	cmpq	%r9,		%rsi		# cmp in+len-8 with in
	movq	%r8,		-8(%rdi)
	jle	.Lstart				# jump if (in <= in+len-8)

	# Tail: turn the block limit in+len-8 into the end address in+len.
.Lend:
	addq	$8,		%r9		# tmp = in+len

	# handle the last bytes, one by one
	# Same round as in the block loop, but each key-stream byte is
	# xor-ed with one input byte and stored immediately.
1:	cmpq	%rsi,		%r9		# cmp in with in+len
	jle	.Lfinished			# jump if (in+len <= in)
	addb	%al,		%dl		# y += tx
	movl	0(%rbp,%rdx,8),	%ebx		# ty = d[y]
	movl	%ebx,		0(%rbp,%rcx,8)	# d[x] = ty
	addb	%al,		%bl		# val = ty + tx
	movl	%eax,		0(%rbp,%rdx,8)	# d[y] = tx
	incb	%cl				# x++		(NEXT ROUND)
	movl	0(%rbp,%rcx,8),	%eax		# tx = d[x]	(NEXT ROUND)
	movb	0(%rbp,%rbx,8),	%r8b		# val = d[val]
	xorb	(%rsi),		%r8b		# xor 1 byte
	movb	%r8b,		(%rdi)
	incq	%rsi				# in++
	incq	%rdi				# out++
	jmp 1b

	# Undo the look-ahead increment of x and write the state back.
	# %rbp points at d (key + 16), so -8(%rbp) is key->y and -16(%rbp)
	# is key->x.  Only the low byte of each field is stored.
.Lfinished:
	decq	%rcx				# x--
	movb	%dl,		-8(%rbp)	# key->y = y
	movb	%cl,		-16(%rbp)	# key->x = x
	popq	%rbx
	popq	%rbp
	ret
	# End marker used only to compute the symbol size for .size.
.L_ARCFOUR_end:
.size ARCFOUR,.L_ARCFOUR_end-ARCFOUR

# Magic indicating no need for an executable stack:
# emit an empty .note.GNU-stack section with no flags, then switch back
# to the section that was active before it.
.section .note.GNU-stack,"",@progbits
.previous