summaryrefslogtreecommitdiffstats
path: root/arch/x86/crypto/glue_helper-asm-avx.S
blob: d08fc575ef7f8e3364ac5ba62152289baa1ab132 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Shared glue code for 128bit block ciphers, AVX assembler macros
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

/*
 * load_8way - fetch eight consecutive 128-bit blocks from memory.
 *
 * src:     base register of the memory operand (address of the first block)
 * x0..x7:  destination vector registers; x<i> receives the 16 bytes found
 *          at byte displacement 16*i from src
 *
 * Every access uses vmovdqu, the unaligned move. The eight loads do not
 * depend on one another.
 */
#define load_8way(src, x0, x1, x2, x3, x4, x5, x6, x7) \
	vmovdqu 0x00(src), x0; \
	vmovdqu 0x10(src), x1; \
	vmovdqu 0x20(src), x2; \
	vmovdqu 0x30(src), x3; \
	vmovdqu 0x40(src), x4; \
	vmovdqu 0x50(src), x5; \
	vmovdqu 0x60(src), x6; \
	vmovdqu 0x70(src), x7;

/*
 * store_8way - write eight vector registers to consecutive 128-bit slots.
 *
 * dst:     base register of the memory operand (address of the first slot)
 * x0..x7:  source vector registers; x<i> is written to the 16 bytes at byte
 *          displacement 16*i from dst
 *
 * Every access uses vmovdqu, the unaligned move. The registers themselves
 * are left unchanged.
 */
#define store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vmovdqu x0, 0x00(dst); \
	vmovdqu x1, 0x10(dst); \
	vmovdqu x2, 0x20(dst); \
	vmovdqu x3, 0x30(dst); \
	vmovdqu x4, 0x40(dst); \
	vmovdqu x5, 0x50(dst); \
	vmovdqu x6, 0x60(dst); \
	vmovdqu x7, 0x70(dst);

/*
 * store_cbc_8way - chain seven of eight blocks against src, then store all.
 *
 * src:     base register of the blocks to XOR in
 * dst:     base register of the output area, handed to store_8way
 * x0..x7:  vector registers holding the eight blocks; modified in place
 *
 * x<i> (i = 1..7) is XOR-ed with the 16 bytes at displacement 16*(i-1) from
 * src. x0 is stored as-is; nothing is XOR-ed into it here.
 * NOTE(review): presumably this is the CBC-decryption chaining step with src
 * pointing at the ciphertext, and the caller handles the IV for x0 -- confirm
 * at the call sites.
 *
 * All reads from src are issued before the first write to dst.
 */
#define store_cbc_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vpxor 0x00(src), x1, x1; \
	vpxor 0x10(src), x2, x2; \
	vpxor 0x20(src), x3, x3; \
	vpxor 0x30(src), x4, x4; \
	vpxor 0x40(src), x5, x5; \
	vpxor 0x50(src), x6, x6; \
	vpxor 0x60(src), x7, x7; \
	store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);

/*
 * inc_le128 - add one to the 128-bit value in a vector register, treating
 * the low quadword as the least significant half.
 *
 * x:          register incremented in place
 * minus_one:  constant operand, not modified; load_ctr_8way passes a
 *             register with -1 in the low quadword and 0 in the high one,
 *             and the step comments below assume that layout
 * tmp:        scratch register, clobbered
 */
#define inc_le128(x, minus_one, tmp) \
	/* tmp.lo = all-ones iff x.lo == -1, i.e. the low half is about to wrap */ \
	vpcmpeqq minus_one, x, tmp; \
	/* x.lo -= -1 (adds one); x.hi -= 0 (unchanged) */ \
	vpsubq minus_one, x, x; \
	/* move the wrap flag up into the high quadword; the low one becomes 0 */ \
	vpslldq $8, tmp, tmp; \
	/* x.hi -= flag: subtracting all-ones (-1) carries one into the high half */ \
	vpsubq tmp, x, x;

/*
 * load_ctr_8way - produce eight consecutive counter blocks from the counter
 * stored at (iv), and advance the stored counter by eight.
 *
 * iv:      base register addressing the 16-byte counter; read with an
 *          unaligned load at the start and rewritten at the end
 * bswap:   operand holding the vpshufb control mask applied to every counter
 *          value (presumably a byte-reversal mask, going by the name --
 *          confirm at the call site). It is fetched with vmovdqa, which
 *          faults on a misaligned memory operand.
 * x0..x7:  outputs; x<i> = shuffle(counter + i), where counter is the value
 *          found at (iv) on entry
 * t0..t2:  scratch registers, all clobbered
 *
 * The running counter is kept unshuffled (in x7, then t2) and incremented
 * with inc_le128, i.e. as a 128-bit integer whose low quadword is the least
 * significant half; only the copies placed in x0..x7 are shuffled.
 */
#define load_ctr_8way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2) \
	/* build the inc_le128 constant: all-ones, then clear the high quadword */ \
	vpcmpeqd t0, t0, t0; \
	vpsrldq $8, t0, t0; /* low: -1, high: 0 */ \
	vmovdqa bswap, t1; \
	\
	/* load IV and byteswap */ \
	vmovdqu (iv), x7; \
	vpshufb t1, x7, x0; \
	\
	/* construct IVs */ \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x1; \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x2; \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x3; \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x4; \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x5; \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x6; \
	inc_le128(x7, t0, t2); \
	/* x7 becomes an output next, so keep the unshuffled counter in t2 */ \
	vmovdqa x7, t2; \
	vpshufb t1, x7, x7; \
	/* the shuffle mask in t1 is no longer needed; reuse t1 as scratch */ \
	inc_le128(t2, t0, t1); \
	/* write back counter + 8 for the next call */ \
	vmovdqu t2, (iv);

/*
 * store_ctr_8way - XOR eight blocks with src and store the result.
 *
 * src:     base register of the blocks to XOR in
 * dst:     base register of the output area, handed to store_8way
 * x0..x7:  vector registers holding the eight blocks; modified in place
 *
 * x<i> is XOR-ed with the 16 bytes at displacement 16*i from src, for every
 * i in 0..7, and the results are then written out by store_8way.
 * NOTE(review): presumably x0..x7 hold the encrypted counter blocks
 * (keystream) and src the data being processed -- confirm at the call sites.
 *
 * All reads from src are issued before the first write to dst.
 */
#define store_ctr_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vpxor 0x00(src), x0, x0; \
	vpxor 0x10(src), x1, x1; \
	vpxor 0x20(src), x2, x2; \
	vpxor 0x30(src), x3, x3; \
	vpxor 0x40(src), x4, x4; \
	vpxor 0x50(src), x5, x5; \
	vpxor 0x60(src), x6, x6; \
	vpxor 0x70(src), x7, x7; \
	store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);

/*
 * gf128mul_x_ble - shift the 128-bit value in iv left by one bit and fold
 * the bits shifted out of each 64-bit half back in through mask.
 *
 * iv:    register updated in place
 * mask:  constant operand, not modified. It selects what is XOR-ed into iv
 *        for each bit that falls off: dword 0 of mask applies when bit 127
 *        was set, dword 2 when bit 63 was set.
 *        NOTE(review): going by the name this is multiplication by x in
 *        GF(2^128), so mask presumably holds 0x87 in the low quadword and 1
 *        in the high quadword, with dwords 1 and 3 zero -- the constant is
 *        defined by the caller; confirm there.
 * tmp:   scratch register, clobbered
 */
#define gf128mul_x_ble(iv, mask, tmp) \
	/* each dword of tmp = all-ones if that dword's top bit is set, else 0 */ \
	vpsrad $31, iv, tmp; \
	/* double each quadword: both halves shift left by one, losing bits 63/127 */ \
	vpaddq iv, iv, iv; \
	/* tmp: dword0 <- old dword3 (bit 127), dword2 <- old dword1 (bit 63); */ \
	/* dwords 1 and 3 <- old dword0, which mask is expected to clear */ \
	vpshufd $0x13, tmp, tmp; \
	vpand mask, tmp, tmp; \
	/* fold the selected bits back into the shifted value */ \
	vpxor tmp, iv, iv;

/*
 * load_xts_8way - generate eight successive tweak values, XOR each into the
 * matching source block, and stash the tweaks in dst.
 *
 * iv:      base register addressing the 16-byte starting tweak; read with an
 *          unaligned load and rewritten at the end with the tweak that
 *          follows the eighth one
 * src:     base register of the eight input blocks
 * dst:     base register of a 128-byte area; on exit slot i holds tweak i
 *          (not the processed data)
 * x0..x7:  outputs; x<i> = src block i XOR tweak i
 * tiv:     register holding the running tweak, clobbered
 * t0:      receives the mask constant, clobbered
 * t1:      scratch for gf128mul_x_ble, clobbered
 * xts_gf128mul_and_shl1_mask:
 *          operand passed to gf128mul_x_ble as its mask; fetched with
 *          vmovdqa, which faults on a misaligned memory operand
 *
 * Tweak 0 is the value at (iv); each later tweak is the previous one passed
 * through gf128mul_x_ble. store_xts_8way in this file reads the tweaks back
 * from dst.
 *
 * Source block i is read before dst slot i is written, and only slots below
 * i have been written by then, so the sequence also works when src and dst
 * are the same buffer.
 */
#define load_xts_8way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, t0, \
		      t1, xts_gf128mul_and_shl1_mask) \
	vmovdqa xts_gf128mul_and_shl1_mask, t0; \
	\
	/* load IV */ \
	vmovdqu (iv), tiv; \
	vpxor (0*16)(src), tiv, x0; \
	vmovdqu tiv, (0*16)(dst); \
	\
	/* construct and store IVs, also xor with source */ \
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (1*16)(src), tiv, x1; \
	vmovdqu tiv, (1*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (2*16)(src), tiv, x2; \
	vmovdqu tiv, (2*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (3*16)(src), tiv, x3; \
	vmovdqu tiv, (3*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (4*16)(src), tiv, x4; \
	vmovdqu tiv, (4*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (5*16)(src), tiv, x5; \
	vmovdqu tiv, (5*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (6*16)(src), tiv, x6; \
	vmovdqu tiv, (6*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (7*16)(src), tiv, x7; \
	vmovdqu tiv, (7*16)(dst); \
	\
	/* advance once more and save the tweak for the next batch */ \
	gf128mul_x_ble(tiv, t0, t1); \
	vmovdqu tiv, (iv);

/*
 * store_xts_8way - XOR eight blocks with the values already in dst, then
 * overwrite dst with the result.
 *
 * dst:     base register of a 128-byte area; read first, then written
 * x0..x7:  vector registers holding the eight blocks; modified in place
 *
 * x<i> is XOR-ed with the 16 bytes at displacement 16*i from dst, for every
 * i in 0..7. load_xts_8way in this file leaves tweak i in slot i of its dst
 * argument, so pairing the two macros on the same dst applies each tweak a
 * second time. All eight slots are read before store_8way writes any.
 */
#define store_xts_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vpxor 0x00(dst), x0, x0; \
	vpxor 0x10(dst), x1, x1; \
	vpxor 0x20(dst), x2, x2; \
	vpxor 0x30(dst), x3, x3; \
	vpxor 0x40(dst), x4, x4; \
	vpxor 0x50(dst), x5, x5; \
	vpxor 0x60(dst), x6, x6; \
	vpxor 0x70(dst), x7, x7; \
	store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);