summaryrefslogtreecommitdiffstats
path: root/arch/x86/crypto/aesni-intel_glue.c
diff options
context:
space:
mode:
authorArd Biesheuvel <ardb@kernel.org>2020-12-31 17:41:54 +0100
committerHerbert Xu <herbert@gondor.apana.org.au>2021-01-08 15:39:47 +1100
commit86ad60a65f29dd862a11c22bb4b5be28d6c5cef1 (patch)
treed0f396b57e398f50604e9e9fb20e793a02b9ccf0 /arch/x86/crypto/aesni-intel_glue.c
parentfecff3b931a52c8d5263fb1537161f0214acb44a (diff)
downloadlinux-86ad60a65f29dd862a11c22bb4b5be28d6c5cef1.tar.bz2
crypto: x86/aes-ni-xts - use direct calls to and 4-way stride
The XTS asm helper arrangement is a bit odd: the 8-way stride helper consists of back-to-back calls to the 4-way core transforms, which are called indirectly, based on a boolean that indicates whether we are performing encryption or decryption. Given how costly indirect calls are on x86, let's switch to direct calls, and given how the 8-way stride doesn't really add anything substantial, use a 4-way stride instead, and make the asm core routine deal with any multiple of 4 blocks. Since 512 byte sectors or 4 KB blocks are the typical quantities XTS operates on, increase the stride exported to the glue helper to 512 bytes as well. As a result, the number of indirect calls is reduced from 3 per 64 bytes of in/output to 1 per 512 bytes of in/output, which produces a 65% speedup when operating on 1 KB blocks (measured on a Intel(R) Core(TM) i7-8650U CPU) Fixes: 9697fa39efd3f ("x86/retpoline/crypto: Convert crypto assembler indirect jumps") Tested-by: Eric Biggers <ebiggers@google.com> # x86_64 Signed-off-by: Ard Biesheuvel <ardb@kernel.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/x86/crypto/aesni-intel_glue.c')
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c25
1 files changed, 14 insertions, 11 deletions
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 96bdc1584215..84e3ed49b35d 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -101,6 +101,12 @@ asmlinkage void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
#define AVX_GEN2_OPTSIZE 640
#define AVX_GEN4_OPTSIZE 4096
+asmlinkage void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
+
+asmlinkage void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
+
#ifdef CONFIG_X86_64
static void (*aesni_ctr_enc_tfm)(struct crypto_aes_ctx *ctx, u8 *out,
@@ -108,9 +114,6 @@ static void (*aesni_ctr_enc_tfm)(struct crypto_aes_ctx *ctx, u8 *out,
asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
-asmlinkage void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *out,
- const u8 *in, bool enc, le128 *iv);
-
/* asmlinkage void aesni_gcm_enc()
* void *ctx, AES Key schedule. Starts on a 16 byte boundary.
* struct gcm_context_data. May be uninitialized.
@@ -663,14 +666,14 @@ static void aesni_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
glue_xts_crypt_128bit_one(ctx, dst, src, iv, aesni_dec);
}
-static void aesni_xts_enc8(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
+static void aesni_xts_enc32(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
{
- aesni_xts_crypt8(ctx, dst, src, true, iv);
+ aesni_xts_encrypt(ctx, dst, src, 32 * AES_BLOCK_SIZE, (u8 *)iv);
}
-static void aesni_xts_dec8(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
+static void aesni_xts_dec32(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
{
- aesni_xts_crypt8(ctx, dst, src, false, iv);
+ aesni_xts_decrypt(ctx, dst, src, 32 * AES_BLOCK_SIZE, (u8 *)iv);
}
static const struct common_glue_ctx aesni_enc_xts = {
@@ -678,8 +681,8 @@ static const struct common_glue_ctx aesni_enc_xts = {
.fpu_blocks_limit = 1,
.funcs = { {
- .num_blocks = 8,
- .fn_u = { .xts = aesni_xts_enc8 }
+ .num_blocks = 32,
+ .fn_u = { .xts = aesni_xts_enc32 }
}, {
.num_blocks = 1,
.fn_u = { .xts = aesni_xts_enc }
@@ -691,8 +694,8 @@ static const struct common_glue_ctx aesni_dec_xts = {
.fpu_blocks_limit = 1,
.funcs = { {
- .num_blocks = 8,
- .fn_u = { .xts = aesni_xts_dec8 }
+ .num_blocks = 32,
+ .fn_u = { .xts = aesni_xts_dec32 }
}, {
.num_blocks = 1,
.fn_u = { .xts = aesni_xts_dec }