summaryrefslogtreecommitdiffstats
path: root/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
diff options
context:
space:
mode:
authorArd Biesheuvel <ardb@kernel.org>2021-01-05 17:47:49 +0100
committerHerbert Xu <herbert@gondor.apana.org.au>2021-01-14 17:10:27 +1100
commit55a7e88f016873ef1717295d8460416b1ccd05a5 (patch)
tree055ecb3a017497622e6e94d8868f96410b34ba5d /arch/x86/crypto/camellia-aesni-avx2-asm_64.S
parent4d6a5a4b1e4a7606bf666ce694671f6897bdabaa (diff)
downloadlinux-55a7e88f016873ef1717295d8460416b1ccd05a5.tar.bz2
crypto: x86/camellia - switch to XTS template
Now that the XTS template can wrap accelerated ECB modes, it can be used to implement Camellia in XTS mode as well, which turns out to be at least as fast, and sometimes even faster. Acked-by: Eric Biggers <ebiggers@google.com> Signed-off-by: Ard Biesheuvel <ardb@kernel.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/x86/crypto/camellia-aesni-avx2-asm_64.S')
-rw-r--r--arch/x86/crypto/camellia-aesni-avx2-asm_64.S207
1 files changed, 0 insertions, 207 deletions
diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
index 0907243c501c..9561dee52de0 100644
--- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
@@ -7,7 +7,6 @@
#include <linux/linkage.h>
#include <asm/frame.h>
-#include <asm/nospec-branch.h>
#define CAMELLIA_TABLE_BYTE_LEN 272
@@ -629,12 +628,6 @@ SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
.Lbswap128_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-/* For XTS mode */
-.Lxts_gf128mul_and_shl1_mask_0:
- .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
-.Lxts_gf128mul_and_shl1_mask_1:
- .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
-
/*
* pre-SubByte transform
*
@@ -1201,203 +1194,3 @@ SYM_FUNC_START(camellia_ctr_32way)
FRAME_END
ret;
SYM_FUNC_END(camellia_ctr_32way)
-
-#define gf128mul_x_ble(iv, mask, tmp) \
- vpsrad $31, iv, tmp; \
- vpaddq iv, iv, iv; \
- vpshufd $0x13, tmp, tmp; \
- vpand mask, tmp, tmp; \
- vpxor tmp, iv, iv;
-
-#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
- vpsrad $31, iv, tmp0; \
- vpaddq iv, iv, tmp1; \
- vpsllq $2, iv, iv; \
- vpshufd $0x13, tmp0, tmp0; \
- vpsrad $31, tmp1, tmp1; \
- vpand mask2, tmp0, tmp0; \
- vpshufd $0x13, tmp1, tmp1; \
- vpxor tmp0, iv, iv; \
- vpand mask1, tmp1, tmp1; \
- vpxor tmp1, iv, iv;
-
-.align 8
-SYM_FUNC_START_LOCAL(camellia_xts_crypt_32way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst (32 blocks)
- * %rdx: src (32 blocks)
- * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
- * %r8: index for input whitening key
- * %r9: pointer to __camellia_enc_blk32 or __camellia_dec_blk32
- */
- FRAME_BEGIN
-
- vzeroupper;
-
- subq $(16 * 32), %rsp;
- movq %rsp, %rax;
-
- vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_0, %ymm12;
-
- /* load IV and construct second IV */
- vmovdqu (%rcx), %xmm0;
- vmovdqa %xmm0, %xmm15;
- gf128mul_x_ble(%xmm0, %xmm12, %xmm13);
- vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_1, %ymm13;
- vinserti128 $1, %xmm0, %ymm15, %ymm0;
- vpxor 0 * 32(%rdx), %ymm0, %ymm15;
- vmovdqu %ymm15, 15 * 32(%rax);
- vmovdqu %ymm0, 0 * 32(%rsi);
-
- /* construct IVs */
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 1 * 32(%rdx), %ymm0, %ymm15;
- vmovdqu %ymm15, 14 * 32(%rax);
- vmovdqu %ymm0, 1 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 2 * 32(%rdx), %ymm0, %ymm15;
- vmovdqu %ymm15, 13 * 32(%rax);
- vmovdqu %ymm0, 2 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 3 * 32(%rdx), %ymm0, %ymm15;
- vmovdqu %ymm15, 12 * 32(%rax);
- vmovdqu %ymm0, 3 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 4 * 32(%rdx), %ymm0, %ymm11;
- vmovdqu %ymm0, 4 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 5 * 32(%rdx), %ymm0, %ymm10;
- vmovdqu %ymm0, 5 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 6 * 32(%rdx), %ymm0, %ymm9;
- vmovdqu %ymm0, 6 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 7 * 32(%rdx), %ymm0, %ymm8;
- vmovdqu %ymm0, 7 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 8 * 32(%rdx), %ymm0, %ymm7;
- vmovdqu %ymm0, 8 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 9 * 32(%rdx), %ymm0, %ymm6;
- vmovdqu %ymm0, 9 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 10 * 32(%rdx), %ymm0, %ymm5;
- vmovdqu %ymm0, 10 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 11 * 32(%rdx), %ymm0, %ymm4;
- vmovdqu %ymm0, 11 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 12 * 32(%rdx), %ymm0, %ymm3;
- vmovdqu %ymm0, 12 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 13 * 32(%rdx), %ymm0, %ymm2;
- vmovdqu %ymm0, 13 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 14 * 32(%rdx), %ymm0, %ymm1;
- vmovdqu %ymm0, 14 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 15 * 32(%rdx), %ymm0, %ymm15;
- vmovdqu %ymm15, 0 * 32(%rax);
- vmovdqu %ymm0, 15 * 32(%rsi);
-
- vextracti128 $1, %ymm0, %xmm0;
- gf128mul_x_ble(%xmm0, %xmm12, %xmm15);
- vmovdqu %xmm0, (%rcx);
-
- /* inpack32_pre: */
- vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15;
- vpshufb .Lpack_bswap, %ymm15, %ymm15;
- vpxor 0 * 32(%rax), %ymm15, %ymm0;
- vpxor %ymm1, %ymm15, %ymm1;
- vpxor %ymm2, %ymm15, %ymm2;
- vpxor %ymm3, %ymm15, %ymm3;
- vpxor %ymm4, %ymm15, %ymm4;
- vpxor %ymm5, %ymm15, %ymm5;
- vpxor %ymm6, %ymm15, %ymm6;
- vpxor %ymm7, %ymm15, %ymm7;
- vpxor %ymm8, %ymm15, %ymm8;
- vpxor %ymm9, %ymm15, %ymm9;
- vpxor %ymm10, %ymm15, %ymm10;
- vpxor %ymm11, %ymm15, %ymm11;
- vpxor 12 * 32(%rax), %ymm15, %ymm12;
- vpxor 13 * 32(%rax), %ymm15, %ymm13;
- vpxor 14 * 32(%rax), %ymm15, %ymm14;
- vpxor 15 * 32(%rax), %ymm15, %ymm15;
-
- CALL_NOSPEC r9;
-
- addq $(16 * 32), %rsp;
-
- vpxor 0 * 32(%rsi), %ymm7, %ymm7;
- vpxor 1 * 32(%rsi), %ymm6, %ymm6;
- vpxor 2 * 32(%rsi), %ymm5, %ymm5;
- vpxor 3 * 32(%rsi), %ymm4, %ymm4;
- vpxor 4 * 32(%rsi), %ymm3, %ymm3;
- vpxor 5 * 32(%rsi), %ymm2, %ymm2;
- vpxor 6 * 32(%rsi), %ymm1, %ymm1;
- vpxor 7 * 32(%rsi), %ymm0, %ymm0;
- vpxor 8 * 32(%rsi), %ymm15, %ymm15;
- vpxor 9 * 32(%rsi), %ymm14, %ymm14;
- vpxor 10 * 32(%rsi), %ymm13, %ymm13;
- vpxor 11 * 32(%rsi), %ymm12, %ymm12;
- vpxor 12 * 32(%rsi), %ymm11, %ymm11;
- vpxor 13 * 32(%rsi), %ymm10, %ymm10;
- vpxor 14 * 32(%rsi), %ymm9, %ymm9;
- vpxor 15 * 32(%rsi), %ymm8, %ymm8;
- write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
- %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
- %ymm8, %rsi);
-
- vzeroupper;
-
- FRAME_END
- ret;
-SYM_FUNC_END(camellia_xts_crypt_32way)
-
-SYM_FUNC_START(camellia_xts_enc_32way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst (32 blocks)
- * %rdx: src (32 blocks)
- * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
- */
-
- xorl %r8d, %r8d; /* input whitening key, 0 for enc */
-
- leaq __camellia_enc_blk32, %r9;
-
- jmp camellia_xts_crypt_32way;
-SYM_FUNC_END(camellia_xts_enc_32way)
-
-SYM_FUNC_START(camellia_xts_dec_32way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst (32 blocks)
- * %rdx: src (32 blocks)
- * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
- */
-
- cmpl $16, key_length(CTX);
- movl $32, %r8d;
- movl $24, %eax;
- cmovel %eax, %r8d; /* input whitening key, last for dec */
-
- leaq __camellia_dec_blk32, %r9;
-
- jmp camellia_xts_crypt_32way;
-SYM_FUNC_END(camellia_xts_dec_32way)