From 9c1e8836edbbaf3656bc07437b59c04be034ac4e Mon Sep 17 00:00:00 2001
From: Kees Cook
Date: Tue, 26 Nov 2019 22:08:02 -0800
Subject: crypto: x86 - Regularize glue function prototypes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The crypto glue performed function prototype casting via macros to make
indirect calls to assembly routines. Instead of performing casts at the
call sites (which trips Control Flow Integrity prototype checking),
switch each prototype to a common standard set of arguments which allows
the removal of the existing macros. In order to keep pointer math
unchanged, internal casting between u128 pointers and u8 pointers is
added.

Co-developed-by: João Moreira
Signed-off-by: João Moreira
Signed-off-by: Kees Cook
Reviewed-by: Eric Biggers
Signed-off-by: Herbert Xu
---
(Three toy sketches illustrating the conversion pattern follow the diff.)

 arch/x86/crypto/aesni-intel_asm.S          |  8 ++--
 arch/x86/crypto/aesni-intel_glue.c         | 45 +++++++-----------
 arch/x86/crypto/camellia_aesni_avx2_glue.c | 74 ++++++++++++++---------------
 arch/x86/crypto/camellia_aesni_avx_glue.c  | 72 +++++++++++++---------------
 arch/x86/crypto/camellia_glue.c            | 45 +++++++++---------
 arch/x86/crypto/cast6_avx_glue.c           | 68 ++++++++++++---------------
 arch/x86/crypto/glue_helper.c              | 23 +++++----
 arch/x86/crypto/serpent_avx2_glue.c        | 65 ++++++++++++--------------
 arch/x86/crypto/serpent_avx_glue.c         | 63 ++++++++++++-------------
 arch/x86/crypto/serpent_sse2_glue.c        | 30 +++++-----
 arch/x86/crypto/twofish_avx_glue.c         | 75 +++++++++++++-----------------
 arch/x86/crypto/twofish_glue_3way.c        | 37 ++++++++-------
 arch/x86/include/asm/crypto/camellia.h     | 63 ++++++++++++-------------
 arch/x86/include/asm/crypto/glue_helper.h  | 18 +++----
 arch/x86/include/asm/crypto/serpent-avx.h  | 20 ++++----
 arch/x86/include/asm/crypto/serpent-sse2.h | 28 +++++------
 arch/x86/include/asm/crypto/twofish.h      | 19 ++++----
 17 files changed, 356 insertions(+), 397 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index d28503f99f58..cad6e1bfa7d5 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -1942,7 +1942,7 @@ SYM_FUNC_START(aesni_set_key)
 SYM_FUNC_END(aesni_set_key)
 
 /*
- * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
+ * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
  */
 SYM_FUNC_START(aesni_enc)
 	FRAME_BEGIN
@@ -2131,7 +2131,7 @@ SYM_FUNC_START_LOCAL(_aesni_enc4)
 SYM_FUNC_END(_aesni_enc4)
 
 /*
- * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
+ * void aesni_dec (const void *ctx, u8 *dst, const u8 *src)
  */
 SYM_FUNC_START(aesni_dec)
 	FRAME_BEGIN
@@ -2716,8 +2716,8 @@ SYM_FUNC_END(aesni_ctr_enc)
 	pxor CTR, IV;
 
 /*
- * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
- *			 bool enc, u8 *iv)
+ * void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *dst,
+ *			 const u8 *src, bool enc, le128 *iv)
  */
 SYM_FUNC_START(aesni_xts_crypt8)
 	FRAME_BEGIN
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 3e707e81afdb..670f8fcf2544 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -83,10 +83,8 @@ struct gcm_context_data {
 
 asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 			     unsigned int key_len);
-asmlinkage void aesni_enc(struct crypto_aes_ctx *ctx, u8 *out,
-			  const u8 *in);
-asmlinkage void aesni_dec(struct crypto_aes_ctx *ctx, u8 *out,
-			  const u8 *in);
+asmlinkage void aesni_enc(const void *ctx, u8 
*out, const u8 *in); +asmlinkage void aesni_dec(const void *ctx, u8 *out, const u8 *in); asmlinkage void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len); asmlinkage void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *out, @@ -106,8 +104,8 @@ static void (*aesni_ctr_enc_tfm)(struct crypto_aes_ctx *ctx, u8 *out, asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); -asmlinkage void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *out, - const u8 *in, bool enc, u8 *iv); +asmlinkage void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *out, + const u8 *in, bool enc, le128 *iv); /* asmlinkage void aesni_gcm_enc() * void *ctx, AES Key schedule. Starts on a 16 byte boundary. @@ -550,29 +548,24 @@ static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key, } -static void aesni_xts_tweak(void *ctx, u8 *out, const u8 *in) +static void aesni_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv) { - aesni_enc(ctx, out, in); + glue_xts_crypt_128bit_one(ctx, dst, src, iv, aesni_enc); } -static void aesni_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) +static void aesni_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv) { - glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_enc)); + glue_xts_crypt_128bit_one(ctx, dst, src, iv, aesni_dec); } -static void aesni_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) +static void aesni_xts_enc8(const void *ctx, u8 *dst, const u8 *src, le128 *iv) { - glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_dec)); + aesni_xts_crypt8(ctx, dst, src, true, iv); } -static void aesni_xts_enc8(void *ctx, u128 *dst, const u128 *src, le128 *iv) +static void aesni_xts_dec8(const void *ctx, u8 *dst, const u8 *src, le128 *iv) { - aesni_xts_crypt8(ctx, (u8 *)dst, (const u8 *)src, true, (u8 *)iv); -} - -static void aesni_xts_dec8(void *ctx, u128 *dst, const u128 *src, le128 *iv) -{ - aesni_xts_crypt8(ctx, (u8 *)dst, (const u8 *)src, false, (u8 *)iv); + aesni_xts_crypt8(ctx, dst, src, false, iv); } static const struct common_glue_ctx aesni_enc_xts = { @@ -581,10 +574,10 @@ static const struct common_glue_ctx aesni_enc_xts = { .funcs = { { .num_blocks = 8, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_enc8) } + .fn_u = { .xts = aesni_xts_enc8 } }, { .num_blocks = 1, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_enc) } + .fn_u = { .xts = aesni_xts_enc } } } }; @@ -594,10 +587,10 @@ static const struct common_glue_ctx aesni_dec_xts = { .funcs = { { .num_blocks = 8, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_dec8) } + .fn_u = { .xts = aesni_xts_dec8 } }, { .num_blocks = 1, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_dec) } + .fn_u = { .xts = aesni_xts_dec } } } }; @@ -606,8 +599,7 @@ static int xts_encrypt(struct skcipher_request *req) struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - return glue_xts_req_128bit(&aesni_enc_xts, req, - XTS_TWEAK_CAST(aesni_xts_tweak), + return glue_xts_req_128bit(&aesni_enc_xts, req, aesni_enc, aes_ctx(ctx->raw_tweak_ctx), aes_ctx(ctx->raw_crypt_ctx), false); @@ -618,8 +610,7 @@ static int xts_decrypt(struct skcipher_request *req) struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - return glue_xts_req_128bit(&aesni_dec_xts, req, - XTS_TWEAK_CAST(aesni_xts_tweak), + return glue_xts_req_128bit(&aesni_dec_xts, req, aesni_enc, aes_ctx(ctx->raw_tweak_ctx), 
aes_ctx(ctx->raw_crypt_ctx), true); diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c index a4f00128ea55..a8cc2c83fe1b 100644 --- a/arch/x86/crypto/camellia_aesni_avx2_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c @@ -19,20 +19,17 @@ #define CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS 32 /* 32-way AVX2/AES-NI parallel cipher functions */ -asmlinkage void camellia_ecb_enc_32way(struct camellia_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void camellia_ecb_dec_32way(struct camellia_ctx *ctx, u8 *dst, - const u8 *src); +asmlinkage void camellia_ecb_enc_32way(const void *ctx, u8 *dst, const u8 *src); +asmlinkage void camellia_ecb_dec_32way(const void *ctx, u8 *dst, const u8 *src); -asmlinkage void camellia_cbc_dec_32way(struct camellia_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void camellia_ctr_32way(struct camellia_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); +asmlinkage void camellia_cbc_dec_32way(const void *ctx, u8 *dst, const u8 *src); +asmlinkage void camellia_ctr_32way(const void *ctx, u8 *dst, const u8 *src, + le128 *iv); -asmlinkage void camellia_xts_enc_32way(struct camellia_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); -asmlinkage void camellia_xts_dec_32way(struct camellia_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); +asmlinkage void camellia_xts_enc_32way(const void *ctx, u8 *dst, const u8 *src, + le128 *iv); +asmlinkage void camellia_xts_dec_32way(const void *ctx, u8 *dst, const u8 *src, + le128 *iv); static const struct common_glue_ctx camellia_enc = { .num_funcs = 4, @@ -40,16 +37,16 @@ static const struct common_glue_ctx camellia_enc = { .funcs = { { .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, - .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_32way) } + .fn_u = { .ecb = camellia_ecb_enc_32way } }, { .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_16way) } + .fn_u = { .ecb = camellia_ecb_enc_16way } }, { .num_blocks = 2, - .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) } + .fn_u = { .ecb = camellia_enc_blk_2way } }, { .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) } + .fn_u = { .ecb = camellia_enc_blk } } } }; @@ -59,16 +56,16 @@ static const struct common_glue_ctx camellia_ctr = { .funcs = { { .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_32way) } + .fn_u = { .ctr = camellia_ctr_32way } }, { .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_16way) } + .fn_u = { .ctr = camellia_ctr_16way } }, { .num_blocks = 2, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) } + .fn_u = { .ctr = camellia_crypt_ctr_2way } }, { .num_blocks = 1, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) } + .fn_u = { .ctr = camellia_crypt_ctr } } } }; @@ -78,13 +75,13 @@ static const struct common_glue_ctx camellia_enc_xts = { .funcs = { { .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_32way) } + .fn_u = { .xts = camellia_xts_enc_32way } }, { .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_16way) } + .fn_u = { .xts = camellia_xts_enc_16way } }, { .num_blocks = 1, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc) } + .fn_u = { .xts = camellia_xts_enc } } } }; @@ -94,16 +91,16 @@ static const struct common_glue_ctx camellia_dec = { .funcs = { { .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, - .fn_u = { .ecb = 
GLUE_FUNC_CAST(camellia_ecb_dec_32way) } + .fn_u = { .ecb = camellia_ecb_dec_32way } }, { .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_16way) } + .fn_u = { .ecb = camellia_ecb_dec_16way } }, { .num_blocks = 2, - .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) } + .fn_u = { .ecb = camellia_dec_blk_2way } }, { .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) } + .fn_u = { .ecb = camellia_dec_blk } } } }; @@ -113,16 +110,16 @@ static const struct common_glue_ctx camellia_dec_cbc = { .funcs = { { .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_32way) } + .fn_u = { .cbc = camellia_cbc_dec_32way } }, { .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_16way) } + .fn_u = { .cbc = camellia_cbc_dec_16way } }, { .num_blocks = 2, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) } + .fn_u = { .cbc = camellia_decrypt_cbc_2way } }, { .num_blocks = 1, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) } + .fn_u = { .cbc = camellia_dec_blk } } } }; @@ -132,13 +129,13 @@ static const struct common_glue_ctx camellia_dec_xts = { .funcs = { { .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_32way) } + .fn_u = { .xts = camellia_xts_dec_32way } }, { .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_16way) } + .fn_u = { .xts = camellia_xts_dec_16way } }, { .num_blocks = 1, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec) } + .fn_u = { .xts = camellia_xts_dec } } } }; @@ -161,8 +158,7 @@ static int ecb_decrypt(struct skcipher_request *req) static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(camellia_enc_blk), - req); + return glue_cbc_encrypt_req_128bit(camellia_enc_blk, req); } static int cbc_decrypt(struct skcipher_request *req) @@ -180,8 +176,7 @@ static int xts_encrypt(struct skcipher_request *req) struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - return glue_xts_req_128bit(&camellia_enc_xts, req, - XTS_TWEAK_CAST(camellia_enc_blk), + return glue_xts_req_128bit(&camellia_enc_xts, req, camellia_enc_blk, &ctx->tweak_ctx, &ctx->crypt_ctx, false); } @@ -190,8 +185,7 @@ static int xts_decrypt(struct skcipher_request *req) struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - return glue_xts_req_128bit(&camellia_dec_xts, req, - XTS_TWEAK_CAST(camellia_enc_blk), + return glue_xts_req_128bit(&camellia_dec_xts, req, camellia_enc_blk, &ctx->tweak_ctx, &ctx->crypt_ctx, true); } diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c index f28d282779b8..31a82a79f4ac 100644 --- a/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c @@ -18,41 +18,36 @@ #define CAMELLIA_AESNI_PARALLEL_BLOCKS 16 /* 16-way parallel cipher functions (avx/aes-ni) */ -asmlinkage void camellia_ecb_enc_16way(struct camellia_ctx *ctx, u8 *dst, - const u8 *src); +asmlinkage void camellia_ecb_enc_16way(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(camellia_ecb_enc_16way); -asmlinkage void camellia_ecb_dec_16way(struct camellia_ctx *ctx, u8 *dst, - const u8 *src); +asmlinkage void camellia_ecb_dec_16way(const void *ctx, u8 *dst, const u8 *src); 
EXPORT_SYMBOL_GPL(camellia_ecb_dec_16way); -asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst, - const u8 *src); +asmlinkage void camellia_cbc_dec_16way(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(camellia_cbc_dec_16way); -asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); +asmlinkage void camellia_ctr_16way(const void *ctx, u8 *dst, const u8 *src, + le128 *iv); EXPORT_SYMBOL_GPL(camellia_ctr_16way); -asmlinkage void camellia_xts_enc_16way(struct camellia_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); +asmlinkage void camellia_xts_enc_16way(const void *ctx, u8 *dst, const u8 *src, + le128 *iv); EXPORT_SYMBOL_GPL(camellia_xts_enc_16way); -asmlinkage void camellia_xts_dec_16way(struct camellia_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); +asmlinkage void camellia_xts_dec_16way(const void *ctx, u8 *dst, const u8 *src, + le128 *iv); EXPORT_SYMBOL_GPL(camellia_xts_dec_16way); -void camellia_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) +void camellia_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv) { - glue_xts_crypt_128bit_one(ctx, dst, src, iv, - GLUE_FUNC_CAST(camellia_enc_blk)); + glue_xts_crypt_128bit_one(ctx, dst, src, iv, camellia_enc_blk); } EXPORT_SYMBOL_GPL(camellia_xts_enc); -void camellia_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) +void camellia_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv) { - glue_xts_crypt_128bit_one(ctx, dst, src, iv, - GLUE_FUNC_CAST(camellia_dec_blk)); + glue_xts_crypt_128bit_one(ctx, dst, src, iv, camellia_dec_blk); } EXPORT_SYMBOL_GPL(camellia_xts_dec); @@ -62,13 +57,13 @@ static const struct common_glue_ctx camellia_enc = { .funcs = { { .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_16way) } + .fn_u = { .ecb = camellia_ecb_enc_16way } }, { .num_blocks = 2, - .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) } + .fn_u = { .ecb = camellia_enc_blk_2way } }, { .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) } + .fn_u = { .ecb = camellia_enc_blk } } } }; @@ -78,13 +73,13 @@ static const struct common_glue_ctx camellia_ctr = { .funcs = { { .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_16way) } + .fn_u = { .ctr = camellia_ctr_16way } }, { .num_blocks = 2, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) } + .fn_u = { .ctr = camellia_crypt_ctr_2way } }, { .num_blocks = 1, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) } + .fn_u = { .ctr = camellia_crypt_ctr } } } }; @@ -94,10 +89,10 @@ static const struct common_glue_ctx camellia_enc_xts = { .funcs = { { .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_16way) } + .fn_u = { .xts = camellia_xts_enc_16way } }, { .num_blocks = 1, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc) } + .fn_u = { .xts = camellia_xts_enc } } } }; @@ -107,13 +102,13 @@ static const struct common_glue_ctx camellia_dec = { .funcs = { { .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_16way) } + .fn_u = { .ecb = camellia_ecb_dec_16way } }, { .num_blocks = 2, - .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) } + .fn_u = { .ecb = camellia_dec_blk_2way } }, { .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) } + .fn_u = { .ecb = camellia_dec_blk } } } }; @@ -123,13 +118,13 @@ static const struct common_glue_ctx camellia_dec_cbc 
= { .funcs = { { .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_16way) } + .fn_u = { .cbc = camellia_cbc_dec_16way } }, { .num_blocks = 2, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) } + .fn_u = { .cbc = camellia_decrypt_cbc_2way } }, { .num_blocks = 1, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) } + .fn_u = { .cbc = camellia_dec_blk } } } }; @@ -139,10 +134,10 @@ static const struct common_glue_ctx camellia_dec_xts = { .funcs = { { .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_16way) } + .fn_u = { .xts = camellia_xts_dec_16way } }, { .num_blocks = 1, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec) } + .fn_u = { .xts = camellia_xts_dec } } } }; @@ -165,8 +160,7 @@ static int ecb_decrypt(struct skcipher_request *req) static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(camellia_enc_blk), - req); + return glue_cbc_encrypt_req_128bit(camellia_enc_blk, req); } static int cbc_decrypt(struct skcipher_request *req) @@ -206,8 +200,7 @@ static int xts_encrypt(struct skcipher_request *req) struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - return glue_xts_req_128bit(&camellia_enc_xts, req, - XTS_TWEAK_CAST(camellia_enc_blk), + return glue_xts_req_128bit(&camellia_enc_xts, req, camellia_enc_blk, &ctx->tweak_ctx, &ctx->crypt_ctx, false); } @@ -216,8 +209,7 @@ static int xts_decrypt(struct skcipher_request *req) struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - return glue_xts_req_128bit(&camellia_dec_xts, req, - XTS_TWEAK_CAST(camellia_enc_blk), + return glue_xts_req_128bit(&camellia_dec_xts, req, camellia_enc_blk, &ctx->tweak_ctx, &ctx->crypt_ctx, true); } diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c index 7c62db56ffe1..5f3ed5af68d7 100644 --- a/arch/x86/crypto/camellia_glue.c +++ b/arch/x86/crypto/camellia_glue.c @@ -18,19 +18,17 @@ #include /* regular block cipher functions */ -asmlinkage void __camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst, - const u8 *src, bool xor); +asmlinkage void __camellia_enc_blk(const void *ctx, u8 *dst, const u8 *src, + bool xor); EXPORT_SYMBOL_GPL(__camellia_enc_blk); -asmlinkage void camellia_dec_blk(struct camellia_ctx *ctx, u8 *dst, - const u8 *src); +asmlinkage void camellia_dec_blk(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(camellia_dec_blk); /* 2-way parallel cipher functions */ -asmlinkage void __camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst, - const u8 *src, bool xor); +asmlinkage void __camellia_enc_blk_2way(const void *ctx, u8 *dst, const u8 *src, + bool xor); EXPORT_SYMBOL_GPL(__camellia_enc_blk_2way); -asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst, - const u8 *src); +asmlinkage void camellia_dec_blk_2way(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(camellia_dec_blk_2way); static void camellia_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) @@ -1267,8 +1265,10 @@ static int camellia_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, return camellia_setkey(&tfm->base, key, key_len); } -void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src) +void camellia_decrypt_cbc_2way(const void *ctx, u8 *d, const u8 *s) { + u128 *dst = (u128 *)d; + const u128 *src = (const u128 *)s; u128 iv = *src; 
camellia_dec_blk_2way(ctx, (u8 *)dst, (u8 *)src); @@ -1277,9 +1277,11 @@ void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src) } EXPORT_SYMBOL_GPL(camellia_decrypt_cbc_2way); -void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) +void camellia_crypt_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv) { be128 ctrblk; + u128 *dst = (u128 *)d; + const u128 *src = (const u128 *)s; if (dst != src) *dst = *src; @@ -1291,9 +1293,11 @@ void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) } EXPORT_SYMBOL_GPL(camellia_crypt_ctr); -void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src, le128 *iv) +void camellia_crypt_ctr_2way(const void *ctx, u8 *d, const u8 *s, le128 *iv) { be128 ctrblks[2]; + u128 *dst = (u128 *)d; + const u128 *src = (const u128 *)s; if (dst != src) { dst[0] = src[0]; @@ -1315,10 +1319,10 @@ static const struct common_glue_ctx camellia_enc = { .funcs = { { .num_blocks = 2, - .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) } + .fn_u = { .ecb = camellia_enc_blk_2way } }, { .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) } + .fn_u = { .ecb = camellia_enc_blk } } } }; @@ -1328,10 +1332,10 @@ static const struct common_glue_ctx camellia_ctr = { .funcs = { { .num_blocks = 2, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) } + .fn_u = { .ctr = camellia_crypt_ctr_2way } }, { .num_blocks = 1, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) } + .fn_u = { .ctr = camellia_crypt_ctr } } } }; @@ -1341,10 +1345,10 @@ static const struct common_glue_ctx camellia_dec = { .funcs = { { .num_blocks = 2, - .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) } + .fn_u = { .ecb = camellia_dec_blk_2way } }, { .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) } + .fn_u = { .ecb = camellia_dec_blk } } } }; @@ -1354,10 +1358,10 @@ static const struct common_glue_ctx camellia_dec_cbc = { .funcs = { { .num_blocks = 2, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) } + .fn_u = { .cbc = camellia_decrypt_cbc_2way } }, { .num_blocks = 1, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) } + .fn_u = { .cbc = camellia_dec_blk } } } }; @@ -1373,8 +1377,7 @@ static int ecb_decrypt(struct skcipher_request *req) static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(camellia_enc_blk), - req); + return glue_cbc_encrypt_req_128bit(camellia_enc_blk, req); } static int cbc_decrypt(struct skcipher_request *req) diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c index a8a38fffb4a9..da5297475f9e 100644 --- a/arch/x86/crypto/cast6_avx_glue.c +++ b/arch/x86/crypto/cast6_avx_glue.c @@ -20,20 +20,17 @@ #define CAST6_PARALLEL_BLOCKS 8 -asmlinkage void cast6_ecb_enc_8way(struct cast6_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void cast6_ecb_dec_8way(struct cast6_ctx *ctx, u8 *dst, - const u8 *src); - -asmlinkage void cast6_cbc_dec_8way(struct cast6_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void cast6_ctr_8way(struct cast6_ctx *ctx, u8 *dst, const u8 *src, +asmlinkage void cast6_ecb_enc_8way(const void *ctx, u8 *dst, const u8 *src); +asmlinkage void cast6_ecb_dec_8way(const void *ctx, u8 *dst, const u8 *src); + +asmlinkage void cast6_cbc_dec_8way(const void *ctx, u8 *dst, const u8 *src); +asmlinkage void cast6_ctr_8way(const void *ctx, u8 *dst, const u8 *src, le128 *iv); -asmlinkage void cast6_xts_enc_8way(struct cast6_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); 
-asmlinkage void cast6_xts_dec_8way(struct cast6_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); +asmlinkage void cast6_xts_enc_8way(const void *ctx, u8 *dst, const u8 *src, + le128 *iv); +asmlinkage void cast6_xts_dec_8way(const void *ctx, u8 *dst, const u8 *src, + le128 *iv); static int cast6_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) @@ -41,21 +38,21 @@ static int cast6_setkey_skcipher(struct crypto_skcipher *tfm, return cast6_setkey(&tfm->base, key, keylen); } -static void cast6_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) +static void cast6_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv) { - glue_xts_crypt_128bit_one(ctx, dst, src, iv, - GLUE_FUNC_CAST(__cast6_encrypt)); + glue_xts_crypt_128bit_one(ctx, dst, src, iv, __cast6_encrypt); } -static void cast6_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) +static void cast6_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv) { - glue_xts_crypt_128bit_one(ctx, dst, src, iv, - GLUE_FUNC_CAST(__cast6_decrypt)); + glue_xts_crypt_128bit_one(ctx, dst, src, iv, __cast6_decrypt); } -static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) +static void cast6_crypt_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv) { be128 ctrblk; + u128 *dst = (u128 *)d; + const u128 *src = (const u128 *)s; le128_to_be128(&ctrblk, iv); le128_inc(iv); @@ -70,10 +67,10 @@ static const struct common_glue_ctx cast6_enc = { .funcs = { { .num_blocks = CAST6_PARALLEL_BLOCKS, - .fn_u = { .ecb = GLUE_FUNC_CAST(cast6_ecb_enc_8way) } + .fn_u = { .ecb = cast6_ecb_enc_8way } }, { .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_encrypt) } + .fn_u = { .ecb = __cast6_encrypt } } } }; @@ -83,10 +80,10 @@ static const struct common_glue_ctx cast6_ctr = { .funcs = { { .num_blocks = CAST6_PARALLEL_BLOCKS, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_ctr_8way) } + .fn_u = { .ctr = cast6_ctr_8way } }, { .num_blocks = 1, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr) } + .fn_u = { .ctr = cast6_crypt_ctr } } } }; @@ -96,10 +93,10 @@ static const struct common_glue_ctx cast6_enc_xts = { .funcs = { { .num_blocks = CAST6_PARALLEL_BLOCKS, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_enc_8way) } + .fn_u = { .xts = cast6_xts_enc_8way } }, { .num_blocks = 1, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_enc) } + .fn_u = { .xts = cast6_xts_enc } } } }; @@ -109,10 +106,10 @@ static const struct common_glue_ctx cast6_dec = { .funcs = { { .num_blocks = CAST6_PARALLEL_BLOCKS, - .fn_u = { .ecb = GLUE_FUNC_CAST(cast6_ecb_dec_8way) } + .fn_u = { .ecb = cast6_ecb_dec_8way } }, { .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_decrypt) } + .fn_u = { .ecb = __cast6_decrypt } } } }; @@ -122,10 +119,10 @@ static const struct common_glue_ctx cast6_dec_cbc = { .funcs = { { .num_blocks = CAST6_PARALLEL_BLOCKS, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(cast6_cbc_dec_8way) } + .fn_u = { .cbc = cast6_cbc_dec_8way } }, { .num_blocks = 1, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__cast6_decrypt) } + .fn_u = { .cbc = __cast6_decrypt } } } }; @@ -135,10 +132,10 @@ static const struct common_glue_ctx cast6_dec_xts = { .funcs = { { .num_blocks = CAST6_PARALLEL_BLOCKS, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_dec_8way) } + .fn_u = { .xts = cast6_xts_dec_8way } }, { .num_blocks = 1, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_dec) } + .fn_u = { .xts = cast6_xts_dec } } } }; @@ -154,8 +151,7 @@ static int ecb_decrypt(struct skcipher_request *req) static int cbc_encrypt(struct 
skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(__cast6_encrypt), - req); + return glue_cbc_encrypt_req_128bit(__cast6_encrypt, req); } static int cbc_decrypt(struct skcipher_request *req) @@ -199,8 +195,7 @@ static int xts_encrypt(struct skcipher_request *req) struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct cast6_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - return glue_xts_req_128bit(&cast6_enc_xts, req, - XTS_TWEAK_CAST(__cast6_encrypt), + return glue_xts_req_128bit(&cast6_enc_xts, req, __cast6_encrypt, &ctx->tweak_ctx, &ctx->crypt_ctx, false); } @@ -209,8 +204,7 @@ static int xts_decrypt(struct skcipher_request *req) struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct cast6_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - return glue_xts_req_128bit(&cast6_dec_xts, req, - XTS_TWEAK_CAST(__cast6_encrypt), + return glue_xts_req_128bit(&cast6_dec_xts, req, __cast6_encrypt, &ctx->tweak_ctx, &ctx->crypt_ctx, true); } diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c index d15b99397480..d3d91a0abf88 100644 --- a/arch/x86/crypto/glue_helper.c +++ b/arch/x86/crypto/glue_helper.c @@ -134,7 +134,8 @@ int glue_cbc_decrypt_req_128bit(const struct common_glue_ctx *gctx, src -= num_blocks - 1; dst -= num_blocks - 1; - gctx->funcs[i].fn_u.cbc(ctx, dst, src); + gctx->funcs[i].fn_u.cbc(ctx, (u8 *)dst, + (const u8 *)src); nbytes -= func_bytes; if (nbytes < bsize) @@ -188,7 +189,9 @@ int glue_ctr_req_128bit(const struct common_glue_ctx *gctx, /* Process multi-block batch */ do { - gctx->funcs[i].fn_u.ctr(ctx, dst, src, &ctrblk); + gctx->funcs[i].fn_u.ctr(ctx, (u8 *)dst, + (const u8 *)src, + &ctrblk); src += num_blocks; dst += num_blocks; nbytes -= func_bytes; @@ -210,7 +213,8 @@ int glue_ctr_req_128bit(const struct common_glue_ctx *gctx, be128_to_le128(&ctrblk, (be128 *)walk.iv); memcpy(&tmp, walk.src.virt.addr, nbytes); - gctx->funcs[gctx->num_funcs - 1].fn_u.ctr(ctx, &tmp, &tmp, + gctx->funcs[gctx->num_funcs - 1].fn_u.ctr(ctx, (u8 *)&tmp, + (const u8 *)&tmp, &ctrblk); memcpy(walk.dst.virt.addr, &tmp, nbytes); le128_to_be128((be128 *)walk.iv, &ctrblk); @@ -240,7 +244,8 @@ static unsigned int __glue_xts_req_128bit(const struct common_glue_ctx *gctx, if (nbytes >= func_bytes) { do { - gctx->funcs[i].fn_u.xts(ctx, dst, src, + gctx->funcs[i].fn_u.xts(ctx, (u8 *)dst, + (const u8 *)src, walk->iv); src += num_blocks; @@ -354,8 +359,8 @@ out: } EXPORT_SYMBOL_GPL(glue_xts_req_128bit); -void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src, le128 *iv, - common_glue_func_t fn) +void glue_xts_crypt_128bit_one(const void *ctx, u8 *dst, const u8 *src, + le128 *iv, common_glue_func_t fn) { le128 ivblk = *iv; @@ -363,13 +368,13 @@ void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src, le128 *iv, gf128mul_x_ble(iv, &ivblk); /* CC <- T xor C */ - u128_xor(dst, src, (u128 *)&ivblk); + u128_xor((u128 *)dst, (const u128 *)src, (u128 *)&ivblk); /* PP <- D(Key2,CC) */ - fn(ctx, (u8 *)dst, (u8 *)dst); + fn(ctx, dst, dst); /* P <- T xor PP */ - u128_xor(dst, dst, (u128 *)&ivblk); + u128_xor((u128 *)dst, (u128 *)dst, (u128 *)&ivblk); } EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit_one); diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c index 13fd8d3d2da0..f973ace44ad3 100644 --- a/arch/x86/crypto/serpent_avx2_glue.c +++ b/arch/x86/crypto/serpent_avx2_glue.c @@ -19,18 +19,16 @@ #define SERPENT_AVX2_PARALLEL_BLOCKS 16 /* 16-way AVX2 parallel cipher functions */ -asmlinkage void 
serpent_ecb_enc_16way(struct serpent_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void serpent_ecb_dec_16way(struct serpent_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void serpent_cbc_dec_16way(void *ctx, u128 *dst, const u128 *src); +asmlinkage void serpent_ecb_enc_16way(const void *ctx, u8 *dst, const u8 *src); +asmlinkage void serpent_ecb_dec_16way(const void *ctx, u8 *dst, const u8 *src); +asmlinkage void serpent_cbc_dec_16way(const void *ctx, u8 *dst, const u8 *src); -asmlinkage void serpent_ctr_16way(void *ctx, u128 *dst, const u128 *src, +asmlinkage void serpent_ctr_16way(const void *ctx, u8 *dst, const u8 *src, le128 *iv); -asmlinkage void serpent_xts_enc_16way(struct serpent_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); -asmlinkage void serpent_xts_dec_16way(struct serpent_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); +asmlinkage void serpent_xts_enc_16way(const void *ctx, u8 *dst, const u8 *src, + le128 *iv); +asmlinkage void serpent_xts_dec_16way(const void *ctx, u8 *dst, const u8 *src, + le128 *iv); static int serpent_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) @@ -44,13 +42,13 @@ static const struct common_glue_ctx serpent_enc = { .funcs = { { .num_blocks = 16, - .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_16way) } + .fn_u = { .ecb = serpent_ecb_enc_16way } }, { .num_blocks = 8, - .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_8way_avx) } + .fn_u = { .ecb = serpent_ecb_enc_8way_avx } }, { .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) } + .fn_u = { .ecb = __serpent_encrypt } } } }; @@ -60,13 +58,13 @@ static const struct common_glue_ctx serpent_ctr = { .funcs = { { .num_blocks = 16, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_16way) } + .fn_u = { .ctr = serpent_ctr_16way } }, { .num_blocks = 8, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) } + .fn_u = { .ctr = serpent_ctr_8way_avx } }, { .num_blocks = 1, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(__serpent_crypt_ctr) } + .fn_u = { .ctr = __serpent_crypt_ctr } } } }; @@ -76,13 +74,13 @@ static const struct common_glue_ctx serpent_enc_xts = { .funcs = { { .num_blocks = 16, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_16way) } + .fn_u = { .xts = serpent_xts_enc_16way } }, { .num_blocks = 8, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_8way_avx) } + .fn_u = { .xts = serpent_xts_enc_8way_avx } }, { .num_blocks = 1, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc) } + .fn_u = { .xts = serpent_xts_enc } } } }; @@ -92,13 +90,13 @@ static const struct common_glue_ctx serpent_dec = { .funcs = { { .num_blocks = 16, - .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_16way) } + .fn_u = { .ecb = serpent_ecb_dec_16way } }, { .num_blocks = 8, - .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_8way_avx) } + .fn_u = { .ecb = serpent_ecb_dec_8way_avx } }, { .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) } + .fn_u = { .ecb = __serpent_decrypt } } } }; @@ -108,13 +106,13 @@ static const struct common_glue_ctx serpent_dec_cbc = { .funcs = { { .num_blocks = 16, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_16way) } + .fn_u = { .cbc = serpent_cbc_dec_16way } }, { .num_blocks = 8, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_8way_avx) } + .fn_u = { .cbc = serpent_cbc_dec_8way_avx } }, { .num_blocks = 1, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) } + .fn_u = { .cbc = __serpent_decrypt } } } }; @@ -124,13 +122,13 @@ static const struct common_glue_ctx serpent_dec_xts = { .funcs = { { 
.num_blocks = 16, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_16way) } + .fn_u = { .xts = serpent_xts_dec_16way } }, { .num_blocks = 8, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_8way_avx) } + .fn_u = { .xts = serpent_xts_dec_8way_avx } }, { .num_blocks = 1, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec) } + .fn_u = { .xts = serpent_xts_dec } } } }; @@ -146,8 +144,7 @@ static int ecb_decrypt(struct skcipher_request *req) static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(__serpent_encrypt), - req); + return glue_cbc_encrypt_req_128bit(__serpent_encrypt, req); } static int cbc_decrypt(struct skcipher_request *req) @@ -166,8 +163,8 @@ static int xts_encrypt(struct skcipher_request *req) struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm); return glue_xts_req_128bit(&serpent_enc_xts, req, - XTS_TWEAK_CAST(__serpent_encrypt), - &ctx->tweak_ctx, &ctx->crypt_ctx, false); + __serpent_encrypt, &ctx->tweak_ctx, + &ctx->crypt_ctx, false); } static int xts_decrypt(struct skcipher_request *req) @@ -176,8 +173,8 @@ static int xts_decrypt(struct skcipher_request *req) struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm); return glue_xts_req_128bit(&serpent_dec_xts, req, - XTS_TWEAK_CAST(__serpent_encrypt), - &ctx->tweak_ctx, &ctx->crypt_ctx, true); + __serpent_encrypt, &ctx->tweak_ctx, + &ctx->crypt_ctx, true); } static struct skcipher_alg serpent_algs[] = { diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c index 7d3dca38a5a2..7806d1cbe854 100644 --- a/arch/x86/crypto/serpent_avx_glue.c +++ b/arch/x86/crypto/serpent_avx_glue.c @@ -20,33 +20,35 @@ #include /* 8-way parallel cipher functions */ -asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst, +asmlinkage void serpent_ecb_enc_8way_avx(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(serpent_ecb_enc_8way_avx); -asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, +asmlinkage void serpent_ecb_dec_8way_avx(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(serpent_ecb_dec_8way_avx); -asmlinkage void serpent_cbc_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, +asmlinkage void serpent_cbc_dec_8way_avx(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(serpent_cbc_dec_8way_avx); -asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); +asmlinkage void serpent_ctr_8way_avx(const void *ctx, u8 *dst, const u8 *src, + le128 *iv); EXPORT_SYMBOL_GPL(serpent_ctr_8way_avx); -asmlinkage void serpent_xts_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst, +asmlinkage void serpent_xts_enc_8way_avx(const void *ctx, u8 *dst, const u8 *src, le128 *iv); EXPORT_SYMBOL_GPL(serpent_xts_enc_8way_avx); -asmlinkage void serpent_xts_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, +asmlinkage void serpent_xts_dec_8way_avx(const void *ctx, u8 *dst, const u8 *src, le128 *iv); EXPORT_SYMBOL_GPL(serpent_xts_dec_8way_avx); -void __serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) +void __serpent_crypt_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv) { be128 ctrblk; + u128 *dst = (u128 *)d; + const u128 *src = (const u128 *)s; le128_to_be128(&ctrblk, iv); le128_inc(iv); @@ -56,17 +58,15 @@ void __serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) } EXPORT_SYMBOL_GPL(__serpent_crypt_ctr); -void serpent_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) +void serpent_xts_enc(const void *ctx, u8 *dst, const u8 
*src, le128 *iv) { - glue_xts_crypt_128bit_one(ctx, dst, src, iv, - GLUE_FUNC_CAST(__serpent_encrypt)); + glue_xts_crypt_128bit_one(ctx, dst, src, iv, __serpent_encrypt); } EXPORT_SYMBOL_GPL(serpent_xts_enc); -void serpent_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) +void serpent_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv) { - glue_xts_crypt_128bit_one(ctx, dst, src, iv, - GLUE_FUNC_CAST(__serpent_decrypt)); + glue_xts_crypt_128bit_one(ctx, dst, src, iv, __serpent_decrypt); } EXPORT_SYMBOL_GPL(serpent_xts_dec); @@ -102,10 +102,10 @@ static const struct common_glue_ctx serpent_enc = { .funcs = { { .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_8way_avx) } + .fn_u = { .ecb = serpent_ecb_enc_8way_avx } }, { .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) } + .fn_u = { .ecb = __serpent_encrypt } } } }; @@ -115,10 +115,10 @@ static const struct common_glue_ctx serpent_ctr = { .funcs = { { .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) } + .fn_u = { .ctr = serpent_ctr_8way_avx } }, { .num_blocks = 1, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(__serpent_crypt_ctr) } + .fn_u = { .ctr = __serpent_crypt_ctr } } } }; @@ -128,10 +128,10 @@ static const struct common_glue_ctx serpent_enc_xts = { .funcs = { { .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_8way_avx) } + .fn_u = { .xts = serpent_xts_enc_8way_avx } }, { .num_blocks = 1, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc) } + .fn_u = { .xts = serpent_xts_enc } } } }; @@ -141,10 +141,10 @@ static const struct common_glue_ctx serpent_dec = { .funcs = { { .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_8way_avx) } + .fn_u = { .ecb = serpent_ecb_dec_8way_avx } }, { .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) } + .fn_u = { .ecb = __serpent_decrypt } } } }; @@ -154,10 +154,10 @@ static const struct common_glue_ctx serpent_dec_cbc = { .funcs = { { .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_8way_avx) } + .fn_u = { .cbc = serpent_cbc_dec_8way_avx } }, { .num_blocks = 1, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) } + .fn_u = { .cbc = __serpent_decrypt } } } }; @@ -167,10 +167,10 @@ static const struct common_glue_ctx serpent_dec_xts = { .funcs = { { .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_8way_avx) } + .fn_u = { .xts = serpent_xts_dec_8way_avx } }, { .num_blocks = 1, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec) } + .fn_u = { .xts = serpent_xts_dec } } } }; @@ -186,8 +186,7 @@ static int ecb_decrypt(struct skcipher_request *req) static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(__serpent_encrypt), - req); + return glue_cbc_encrypt_req_128bit(__serpent_encrypt, req); } static int cbc_decrypt(struct skcipher_request *req) @@ -206,8 +205,8 @@ static int xts_encrypt(struct skcipher_request *req) struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm); return glue_xts_req_128bit(&serpent_enc_xts, req, - XTS_TWEAK_CAST(__serpent_encrypt), - &ctx->tweak_ctx, &ctx->crypt_ctx, false); + __serpent_encrypt, &ctx->tweak_ctx, + &ctx->crypt_ctx, false); } static int xts_decrypt(struct skcipher_request *req) @@ -216,8 +215,8 @@ static int xts_decrypt(struct skcipher_request *req) struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm); 
return glue_xts_req_128bit(&serpent_dec_xts, req, - XTS_TWEAK_CAST(__serpent_encrypt), - &ctx->tweak_ctx, &ctx->crypt_ctx, true); + __serpent_encrypt, &ctx->tweak_ctx, + &ctx->crypt_ctx, true); } static struct skcipher_alg serpent_algs[] = { diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index 5fdf1931d069..4fed8d26b91a 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -31,9 +31,11 @@ static int serpent_setkey_skcipher(struct crypto_skcipher *tfm, return __serpent_setkey(crypto_skcipher_ctx(tfm), key, keylen); } -static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src) +static void serpent_decrypt_cbc_xway(const void *ctx, u8 *d, const u8 *s) { u128 ivs[SERPENT_PARALLEL_BLOCKS - 1]; + u128 *dst = (u128 *)d; + const u128 *src = (const u128 *)s; unsigned int j; for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++) @@ -45,9 +47,11 @@ static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src) u128_xor(dst + (j + 1), dst + (j + 1), ivs + j); } -static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) +static void serpent_crypt_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv) { be128 ctrblk; + u128 *dst = (u128 *)d; + const u128 *src = (const u128 *)s; le128_to_be128(&ctrblk, iv); le128_inc(iv); @@ -56,10 +60,12 @@ static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) u128_xor(dst, src, (u128 *)&ctrblk); } -static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src, +static void serpent_crypt_ctr_xway(const void *ctx, u8 *d, const u8 *s, le128 *iv) { be128 ctrblks[SERPENT_PARALLEL_BLOCKS]; + u128 *dst = (u128 *)d; + const u128 *src = (const u128 *)s; unsigned int i; for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) { @@ -79,10 +85,10 @@ static const struct common_glue_ctx serpent_enc = { .funcs = { { .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_enc_blk_xway) } + .fn_u = { .ecb = serpent_enc_blk_xway } }, { .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) } + .fn_u = { .ecb = __serpent_encrypt } } } }; @@ -92,10 +98,10 @@ static const struct common_glue_ctx serpent_ctr = { .funcs = { { .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr_xway) } + .fn_u = { .ctr = serpent_crypt_ctr_xway } }, { .num_blocks = 1, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) } + .fn_u = { .ctr = serpent_crypt_ctr } } } }; @@ -105,10 +111,10 @@ static const struct common_glue_ctx serpent_dec = { .funcs = { { .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_dec_blk_xway) } + .fn_u = { .ecb = serpent_dec_blk_xway } }, { .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) } + .fn_u = { .ecb = __serpent_decrypt } } } }; @@ -118,10 +124,10 @@ static const struct common_glue_ctx serpent_dec_cbc = { .funcs = { { .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_decrypt_cbc_xway) } + .fn_u = { .cbc = serpent_decrypt_cbc_xway } }, { .num_blocks = 1, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) } + .fn_u = { .cbc = __serpent_decrypt } } } }; @@ -137,7 +143,7 @@ static int ecb_decrypt(struct skcipher_request *req) static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(__serpent_encrypt), + return glue_cbc_encrypt_req_128bit(__serpent_encrypt, req); } diff --git a/arch/x86/crypto/twofish_avx_glue.c 
b/arch/x86/crypto/twofish_avx_glue.c index d561c821788b..3b36e97ec7ab 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c @@ -22,20 +22,17 @@ #define TWOFISH_PARALLEL_BLOCKS 8 /* 8-way parallel cipher functions */ -asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src); +asmlinkage void twofish_ecb_enc_8way(const void *ctx, u8 *dst, const u8 *src); +asmlinkage void twofish_ecb_dec_8way(const void *ctx, u8 *dst, const u8 *src); -asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); +asmlinkage void twofish_cbc_dec_8way(const void *ctx, u8 *dst, const u8 *src); +asmlinkage void twofish_ctr_8way(const void *ctx, u8 *dst, const u8 *src, + le128 *iv); -asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); -asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); +asmlinkage void twofish_xts_enc_8way(const void *ctx, u8 *dst, const u8 *src, + le128 *iv); +asmlinkage void twofish_xts_dec_8way(const void *ctx, u8 *dst, const u8 *src, + le128 *iv); static int twofish_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) @@ -43,22 +40,19 @@ static int twofish_setkey_skcipher(struct crypto_skcipher *tfm, return twofish_setkey(&tfm->base, key, keylen); } -static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src) +static inline void twofish_enc_blk_3way(const void *ctx, u8 *dst, const u8 *src) { __twofish_enc_blk_3way(ctx, dst, src, false); } -static void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) +static void twofish_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv) { - glue_xts_crypt_128bit_one(ctx, dst, src, iv, - GLUE_FUNC_CAST(twofish_enc_blk)); + glue_xts_crypt_128bit_one(ctx, dst, src, iv, twofish_enc_blk); } -static void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) +static void twofish_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv) { - glue_xts_crypt_128bit_one(ctx, dst, src, iv, - GLUE_FUNC_CAST(twofish_dec_blk)); + glue_xts_crypt_128bit_one(ctx, dst, src, iv, twofish_dec_blk); } struct twofish_xts_ctx { @@ -93,13 +87,13 @@ static const struct common_glue_ctx twofish_enc = { .funcs = { { .num_blocks = TWOFISH_PARALLEL_BLOCKS, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_8way) } + .fn_u = { .ecb = twofish_ecb_enc_8way } }, { .num_blocks = 3, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) } + .fn_u = { .ecb = twofish_enc_blk_3way } }, { .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) } + .fn_u = { .ecb = twofish_enc_blk } } } }; @@ -109,13 +103,13 @@ static const struct common_glue_ctx twofish_ctr = { .funcs = { { .num_blocks = TWOFISH_PARALLEL_BLOCKS, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_8way) } + .fn_u = { .ctr = twofish_ctr_8way } }, { .num_blocks = 3, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) } + .fn_u = { .ctr = twofish_enc_blk_ctr_3way } }, { .num_blocks = 1, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr) } + .fn_u = { .ctr = twofish_enc_blk_ctr } } } }; @@ -125,10 +119,10 @@ static const struct common_glue_ctx twofish_enc_xts = { .funcs = { { .num_blocks = TWOFISH_PARALLEL_BLOCKS, - .fn_u = { .xts = 
GLUE_XTS_FUNC_CAST(twofish_xts_enc_8way) } + .fn_u = { .xts = twofish_xts_enc_8way } }, { .num_blocks = 1, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc) } + .fn_u = { .xts = twofish_xts_enc } } } }; @@ -138,13 +132,13 @@ static const struct common_glue_ctx twofish_dec = { .funcs = { { .num_blocks = TWOFISH_PARALLEL_BLOCKS, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_8way) } + .fn_u = { .ecb = twofish_ecb_dec_8way } }, { .num_blocks = 3, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) } + .fn_u = { .ecb = twofish_dec_blk_3way } }, { .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) } + .fn_u = { .ecb = twofish_dec_blk } } } }; @@ -154,13 +148,13 @@ static const struct common_glue_ctx twofish_dec_cbc = { .funcs = { { .num_blocks = TWOFISH_PARALLEL_BLOCKS, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_8way) } + .fn_u = { .cbc = twofish_cbc_dec_8way } }, { .num_blocks = 3, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) } + .fn_u = { .cbc = twofish_dec_blk_cbc_3way } }, { .num_blocks = 1, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) } + .fn_u = { .cbc = twofish_dec_blk } } } }; @@ -170,10 +164,10 @@ static const struct common_glue_ctx twofish_dec_xts = { .funcs = { { .num_blocks = TWOFISH_PARALLEL_BLOCKS, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_8way) } + .fn_u = { .xts = twofish_xts_dec_8way } }, { .num_blocks = 1, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec) } + .fn_u = { .xts = twofish_xts_dec } } } }; @@ -189,8 +183,7 @@ static int ecb_decrypt(struct skcipher_request *req) static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(twofish_enc_blk), - req); + return glue_cbc_encrypt_req_128bit(twofish_enc_blk, req); } static int cbc_decrypt(struct skcipher_request *req) @@ -208,8 +201,7 @@ static int xts_encrypt(struct skcipher_request *req) struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct twofish_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - return glue_xts_req_128bit(&twofish_enc_xts, req, - XTS_TWEAK_CAST(twofish_enc_blk), + return glue_xts_req_128bit(&twofish_enc_xts, req, twofish_enc_blk, &ctx->tweak_ctx, &ctx->crypt_ctx, false); } @@ -218,8 +210,7 @@ static int xts_decrypt(struct skcipher_request *req) struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct twofish_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - return glue_xts_req_128bit(&twofish_dec_xts, req, - XTS_TWEAK_CAST(twofish_enc_blk), + return glue_xts_req_128bit(&twofish_dec_xts, req, twofish_enc_blk, &ctx->tweak_ctx, &ctx->crypt_ctx, true); } diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index 1dc9e29f221e..768af6075479 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -25,21 +25,22 @@ static int twofish_setkey_skcipher(struct crypto_skcipher *tfm, return twofish_setkey(&tfm->base, key, keylen); } -static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src) +static inline void twofish_enc_blk_3way(const void *ctx, u8 *dst, const u8 *src) { __twofish_enc_blk_3way(ctx, dst, src, false); } -static inline void twofish_enc_blk_xor_3way(struct twofish_ctx *ctx, u8 *dst, +static inline void twofish_enc_blk_xor_3way(const void *ctx, u8 *dst, const u8 *src) { __twofish_enc_blk_3way(ctx, dst, src, true); } -void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src) +void twofish_dec_blk_cbc_3way(const void *ctx, u8 *d, const u8 *s) { u128 ivs[2]; 
+ u128 *dst = (u128 *)d; + const u128 *src = (const u128 *)s; ivs[0] = src[0]; ivs[1] = src[1]; @@ -51,9 +52,11 @@ void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src) } EXPORT_SYMBOL_GPL(twofish_dec_blk_cbc_3way); -void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) +void twofish_enc_blk_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv) { be128 ctrblk; + u128 *dst = (u128 *)d; + const u128 *src = (const u128 *)s; if (dst != src) *dst = *src; @@ -66,10 +69,11 @@ void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) } EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr); -void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src, - le128 *iv) +void twofish_enc_blk_ctr_3way(const void *ctx, u8 *d, const u8 *s, le128 *iv) { be128 ctrblks[3]; + u128 *dst = (u128 *)d; + const u128 *src = (const u128 *)s; if (dst != src) { dst[0] = src[0]; @@ -94,10 +98,10 @@ static const struct common_glue_ctx twofish_enc = { .funcs = { { .num_blocks = 3, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) } + .fn_u = { .ecb = twofish_enc_blk_3way } }, { .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) } + .fn_u = { .ecb = twofish_enc_blk } } } }; @@ -107,10 +111,10 @@ static const struct common_glue_ctx twofish_ctr = { .funcs = { { .num_blocks = 3, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_ctr_3way) } + .fn_u = { .ctr = twofish_enc_blk_ctr_3way } }, { .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_ctr) } + .fn_u = { .ctr = twofish_enc_blk_ctr } } } }; @@ -120,10 +124,10 @@ static const struct common_glue_ctx twofish_dec = { .funcs = { { .num_blocks = 3, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) } + .fn_u = { .ecb = twofish_dec_blk_3way } }, { .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) } + .fn_u = { .ecb = twofish_dec_blk } } } }; @@ -133,10 +137,10 @@ static const struct common_glue_ctx twofish_dec_cbc = { .funcs = { { .num_blocks = 3, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) } + .fn_u = { .cbc = twofish_dec_blk_cbc_3way } }, { .num_blocks = 1, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) } + .fn_u = { .cbc = twofish_dec_blk } } } }; @@ -152,8 +156,7 @@ static int ecb_decrypt(struct skcipher_request *req) static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(twofish_enc_blk), - req); + return glue_cbc_encrypt_req_128bit(twofish_enc_blk, req); } static int cbc_decrypt(struct skcipher_request *req) diff --git a/arch/x86/include/asm/crypto/camellia.h b/arch/x86/include/asm/crypto/camellia.h index a5d86fc0593f..f1592619dd65 100644 --- a/arch/x86/include/asm/crypto/camellia.h +++ b/arch/x86/include/asm/crypto/camellia.h @@ -32,65 +32,60 @@ extern int xts_camellia_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen); /* regular block cipher functions */ -asmlinkage void __camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst, - const u8 *src, bool xor); -asmlinkage void camellia_dec_blk(struct camellia_ctx *ctx, u8 *dst, - const u8 *src); +asmlinkage void __camellia_enc_blk(const void *ctx, u8 *dst, const u8 *src, + bool xor); +asmlinkage void camellia_dec_blk(const void *ctx, u8 *dst, const u8 *src); /* 2-way parallel cipher functions */ -asmlinkage void __camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst, - const u8 *src, bool xor); -asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst, - const u8 *src); +asmlinkage void 
__camellia_enc_blk_2way(const void *ctx, u8 *dst, const u8 *src, + bool xor); +asmlinkage void camellia_dec_blk_2way(const void *ctx, u8 *dst, const u8 *src); /* 16-way parallel cipher functions (avx/aes-ni) */ -asmlinkage void camellia_ecb_enc_16way(struct camellia_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void camellia_ecb_dec_16way(struct camellia_ctx *ctx, u8 *dst, - const u8 *src); - -asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); - -asmlinkage void camellia_xts_enc_16way(struct camellia_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); -asmlinkage void camellia_xts_dec_16way(struct camellia_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); - -static inline void camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst, - const u8 *src) +asmlinkage void camellia_ecb_enc_16way(const void *ctx, u8 *dst, const u8 *src); +asmlinkage void camellia_ecb_dec_16way(const void *ctx, u8 *dst, const u8 *src); + +asmlinkage void camellia_cbc_dec_16way(const void *ctx, u8 *dst, const u8 *src); +asmlinkage void camellia_ctr_16way(const void *ctx, u8 *dst, const u8 *src, + le128 *iv); + +asmlinkage void camellia_xts_enc_16way(const void *ctx, u8 *dst, const u8 *src, + le128 *iv); +asmlinkage void camellia_xts_dec_16way(const void *ctx, u8 *dst, const u8 *src, + le128 *iv); + +static inline void camellia_enc_blk(const void *ctx, u8 *dst, const u8 *src) { __camellia_enc_blk(ctx, dst, src, false); } -static inline void camellia_enc_blk_xor(struct camellia_ctx *ctx, u8 *dst, - const u8 *src) +static inline void camellia_enc_blk_xor(const void *ctx, u8 *dst, const u8 *src) { __camellia_enc_blk(ctx, dst, src, true); } -static inline void camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst, +static inline void camellia_enc_blk_2way(const void *ctx, u8 *dst, const u8 *src) { __camellia_enc_blk_2way(ctx, dst, src, false); } -static inline void camellia_enc_blk_xor_2way(struct camellia_ctx *ctx, u8 *dst, +static inline void camellia_enc_blk_xor_2way(const void *ctx, u8 *dst, const u8 *src) { __camellia_enc_blk_2way(ctx, dst, src, true); } /* glue helpers */ -extern void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src); -extern void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, +extern void camellia_decrypt_cbc_2way(const void *ctx, u8 *dst, const u8 *src); +extern void camellia_crypt_ctr(const void *ctx, u8 *dst, const u8 *src, le128 *iv); -extern void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src, +extern void camellia_crypt_ctr_2way(const void *ctx, u8 *dst, const u8 *src, le128 *iv); -extern void camellia_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv); -extern void camellia_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv); +extern void camellia_xts_enc(const void *ctx, u8 *dst, const u8 *src, + le128 *iv); +extern void camellia_xts_dec(const void *ctx, u8 *dst, const u8 *src, + le128 *iv); #endif /* ASM_X86_CAMELLIA_H */ diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h index 8d4a8e1226ee..777c0f63418c 100644 --- a/arch/x86/include/asm/crypto/glue_helper.h +++ b/arch/x86/include/asm/crypto/glue_helper.h @@ -11,18 +11,13 @@ #include #include -typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src); -typedef void (*common_glue_cbc_func_t)(void *ctx, u128 *dst, const u128 *src); -typedef void (*common_glue_ctr_func_t)(void *ctx, u128 *dst, 
const u128 *src, +typedef void (*common_glue_func_t)(const void *ctx, u8 *dst, const u8 *src); +typedef void (*common_glue_cbc_func_t)(const void *ctx, u8 *dst, const u8 *src); +typedef void (*common_glue_ctr_func_t)(const void *ctx, u8 *dst, const u8 *src, le128 *iv); -typedef void (*common_glue_xts_func_t)(void *ctx, u128 *dst, const u128 *src, +typedef void (*common_glue_xts_func_t)(const void *ctx, u8 *dst, const u8 *src, le128 *iv); -#define GLUE_FUNC_CAST(fn) ((common_glue_func_t)(fn)) -#define GLUE_CBC_FUNC_CAST(fn) ((common_glue_cbc_func_t)(fn)) -#define GLUE_CTR_FUNC_CAST(fn) ((common_glue_ctr_func_t)(fn)) -#define GLUE_XTS_FUNC_CAST(fn) ((common_glue_xts_func_t)(fn)) - struct common_glue_func_entry { unsigned int num_blocks; /* number of blocks that @fn will process */ union { @@ -116,7 +111,8 @@ extern int glue_xts_req_128bit(const struct common_glue_ctx *gctx, common_glue_func_t tweak_fn, void *tweak_ctx, void *crypt_ctx, bool decrypt); -extern void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src, - le128 *iv, common_glue_func_t fn); +extern void glue_xts_crypt_128bit_one(const void *ctx, u8 *dst, + const u8 *src, le128 *iv, + common_glue_func_t fn); #endif /* _CRYPTO_GLUE_HELPER_H */ diff --git a/arch/x86/include/asm/crypto/serpent-avx.h b/arch/x86/include/asm/crypto/serpent-avx.h index db7c9cc32234..251c2c89d7cf 100644 --- a/arch/x86/include/asm/crypto/serpent-avx.h +++ b/arch/x86/include/asm/crypto/serpent-avx.h @@ -15,26 +15,26 @@ struct serpent_xts_ctx { struct serpent_ctx crypt_ctx; }; -asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst, +asmlinkage void serpent_ecb_enc_8way_avx(const void *ctx, u8 *dst, const u8 *src); -asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, +asmlinkage void serpent_ecb_dec_8way_avx(const void *ctx, u8 *dst, const u8 *src); -asmlinkage void serpent_cbc_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, +asmlinkage void serpent_cbc_dec_8way_avx(const void *ctx, u8 *dst, const u8 *src); -asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); +asmlinkage void serpent_ctr_8way_avx(const void *ctx, u8 *dst, const u8 *src, + le128 *iv); -asmlinkage void serpent_xts_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst, +asmlinkage void serpent_xts_enc_8way_avx(const void *ctx, u8 *dst, const u8 *src, le128 *iv); -asmlinkage void serpent_xts_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, +asmlinkage void serpent_xts_dec_8way_avx(const void *ctx, u8 *dst, const u8 *src, le128 *iv); -extern void __serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, +extern void __serpent_crypt_ctr(const void *ctx, u8 *dst, const u8 *src, le128 *iv); -extern void serpent_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv); -extern void serpent_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv); +extern void serpent_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv); +extern void serpent_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv); extern int xts_serpent_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen); diff --git a/arch/x86/include/asm/crypto/serpent-sse2.h b/arch/x86/include/asm/crypto/serpent-sse2.h index 1a345e8a7496..860ca248914b 100644 --- a/arch/x86/include/asm/crypto/serpent-sse2.h +++ b/arch/x86/include/asm/crypto/serpent-sse2.h @@ -9,25 +9,23 @@ #define SERPENT_PARALLEL_BLOCKS 4 -asmlinkage void __serpent_enc_blk_4way(struct serpent_ctx *ctx, u8 *dst, +asmlinkage void __serpent_enc_blk_4way(const 
struct serpent_ctx *ctx, u8 *dst, const u8 *src, bool xor); -asmlinkage void serpent_dec_blk_4way(struct serpent_ctx *ctx, u8 *dst, +asmlinkage void serpent_dec_blk_4way(const struct serpent_ctx *ctx, u8 *dst, const u8 *src); -static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst, - const u8 *src) +static inline void serpent_enc_blk_xway(const void *ctx, u8 *dst, const u8 *src) { __serpent_enc_blk_4way(ctx, dst, src, false); } -static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst, - const u8 *src) +static inline void serpent_enc_blk_xway_xor(const struct serpent_ctx *ctx, + u8 *dst, const u8 *src) { __serpent_enc_blk_4way(ctx, dst, src, true); } -static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst, - const u8 *src) +static inline void serpent_dec_blk_xway(const void *ctx, u8 *dst, const u8 *src) { serpent_dec_blk_4way(ctx, dst, src); } @@ -36,25 +34,23 @@ static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst, #define SERPENT_PARALLEL_BLOCKS 8 -asmlinkage void __serpent_enc_blk_8way(struct serpent_ctx *ctx, u8 *dst, +asmlinkage void __serpent_enc_blk_8way(const struct serpent_ctx *ctx, u8 *dst, const u8 *src, bool xor); -asmlinkage void serpent_dec_blk_8way(struct serpent_ctx *ctx, u8 *dst, +asmlinkage void serpent_dec_blk_8way(const struct serpent_ctx *ctx, u8 *dst, const u8 *src); -static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst, - const u8 *src) +static inline void serpent_enc_blk_xway(const void *ctx, u8 *dst, const u8 *src) { __serpent_enc_blk_8way(ctx, dst, src, false); } -static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst, - const u8 *src) +static inline void serpent_enc_blk_xway_xor(const struct serpent_ctx *ctx, + u8 *dst, const u8 *src) { __serpent_enc_blk_8way(ctx, dst, src, true); } -static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst, - const u8 *src) +static inline void serpent_dec_blk_xway(const void *ctx, u8 *dst, const u8 *src) { serpent_dec_blk_8way(ctx, dst, src); } diff --git a/arch/x86/include/asm/crypto/twofish.h b/arch/x86/include/asm/crypto/twofish.h index f618bf272b90..2c377a8042e1 100644 --- a/arch/x86/include/asm/crypto/twofish.h +++ b/arch/x86/include/asm/crypto/twofish.h @@ -7,22 +7,19 @@ #include /* regular block cipher functions from twofish_x86_64 module */ -asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void twofish_dec_blk(struct twofish_ctx *ctx, u8 *dst, - const u8 *src); +asmlinkage void twofish_enc_blk(const void *ctx, u8 *dst, const u8 *src); +asmlinkage void twofish_dec_blk(const void *ctx, u8 *dst, const u8 *src); /* 3-way parallel cipher functions */ -asmlinkage void __twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src, bool xor); -asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src); +asmlinkage void __twofish_enc_blk_3way(const void *ctx, u8 *dst, const u8 *src, + bool xor); +asmlinkage void twofish_dec_blk_3way(const void *ctx, u8 *dst, const u8 *src); /* helpers from twofish_x86_64-3way module */ -extern void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src); -extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, +extern void twofish_dec_blk_cbc_3way(const void *ctx, u8 *dst, const u8 *src); +extern void twofish_enc_blk_ctr(const void *ctx, u8 *dst, const u8 *src, le128 *iv); -extern void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src, +extern 
void twofish_enc_blk_ctr_3way(const void *ctx, u8 *dst, const u8 *src, le128 *iv); #endif /* ASM_X86_TWOFISH_H */ -- cgit v1.2.3 From 674f368a952c48ede71784935a799a5205b92b6c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Dec 2019 21:19:36 -0600 Subject: crypto: remove CRYPTO_TFM_RES_BAD_KEY_LEN MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CRYPTO_TFM_RES_BAD_KEY_LEN flag was apparently meant as a way to make the ->setkey() functions provide more information about errors. However, no one actually checks for this flag, which makes it pointless. Also, many algorithms fail to set this flag when given a bad length key. Reviewing just the generic implementations, this is the case for aes-fixed-time, cbcmac, echainiv, nhpoly1305, pcrypt, rfc3686, rfc4309, rfc7539, rfc7539esp, salsa20, seqiv, and xcbc. But there are probably many more in arch/*/crypto/ and drivers/crypto/. Some algorithms can even set this flag when the key is the correct length. For example, authenc and authencesn set it when the key payload is malformed in any way (not just a bad length), the atmel-sha and ccree drivers can set it if a memory allocation fails, and the chelsio driver sets it for bad auth tag lengths, not just bad key lengths. So even if someone actually wanted to start checking this flag (which seems unlikely, since it's been unused for a long time), there would be a lot of work needed to get it working correctly. But it would probably be much better to go back to the drawing board and just define different return values, like -EINVAL if the key is invalid for the algorithm vs. -EKEYREJECTED if the key was rejected by a policy like "no weak keys". That would be much simpler, less error-prone, and easier to test. So just remove this flag. 
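For illustration, the mechanical shape of this conversion: every ->setkey() that used to report a bad key both through the tfm flag and through its return value now relies on the return value alone. Below is a minimal before/after sketch in kernel style; example_setkey_old/new and the 4-byte key check are hypothetical stand-ins, not code from any algorithm or driver touched by this patch:

	#include <crypto/hash.h>	/* crypto_shash_ctx(), crypto_shash_set_flags() */
	#include <linux/errno.h>
	#include <linux/string.h>

	/* Before: a bad key length is reported twice, via the
	 * CRYPTO_TFM_RES_BAD_KEY_LEN flag and via -EINVAL. */
	static int example_setkey_old(struct crypto_shash *tfm, const u8 *key,
				      unsigned int keylen)
	{
		u32 *mctx = crypto_shash_ctx(tfm);

		if (keylen != sizeof(u32)) {
			crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
			return -EINVAL;
		}
		memcpy(mctx, key, sizeof(u32));
		return 0;
	}

	/* After: the return value alone carries the error; no flag is set. */
	static int example_setkey_new(struct crypto_shash *tfm, const u8 *key,
				      unsigned int keylen)
	{
		u32 *mctx = crypto_shash_ctx(tfm);

		if (keylen != sizeof(u32))
			return -EINVAL;
		memcpy(mctx, key, sizeof(u32));
		return 0;
	}

Since no caller ever inspected the flag, both versions behave identically from the API's point of view; a future distinction between an invalid key (-EINVAL) and a policy rejection (-EKEYREJECTED), as suggested above, would likewise be expressed purely in the return value.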
Signed-off-by: Eric Biggers Reviewed-by: Horia Geantă Signed-off-by: Herbert Xu --- arch/arm/crypto/aes-ce-glue.c | 14 +------ arch/arm/crypto/crc32-ce-glue.c | 4 +- arch/arm/crypto/ghash-ce-glue.c | 4 +- arch/arm64/crypto/aes-ce-ccm-glue.c | 8 +--- arch/arm64/crypto/aes-ce-glue.c | 8 +--- arch/arm64/crypto/aes-glue.c | 31 +++----------- arch/arm64/crypto/ghash-ce-glue.c | 8 +--- arch/mips/crypto/crc32-mips.c | 4 +- arch/powerpc/crypto/aes-spe-glue.c | 18 ++------- arch/powerpc/crypto/crc32c-vpmsum_glue.c | 4 +- arch/s390/crypto/aes_s390.c | 4 +- arch/s390/crypto/crc32-vx.c | 8 +--- arch/s390/crypto/ghash_s390.c | 4 +- arch/s390/crypto/paes_s390.c | 25 +++--------- arch/sparc/crypto/aes_glue.c | 2 - arch/sparc/crypto/camellia_glue.c | 5 +-- arch/sparc/crypto/crc32c_glue.c | 4 +- arch/x86/crypto/aegis128-aesni-glue.c | 4 +- arch/x86/crypto/aesni-intel_glue.c | 10 ++--- arch/x86/crypto/blake2s-glue.c | 4 +- arch/x86/crypto/camellia_aesni_avx2_glue.c | 3 +- arch/x86/crypto/camellia_aesni_avx_glue.c | 9 ++--- arch/x86/crypto/camellia_glue.c | 9 ++--- arch/x86/crypto/cast6_avx_glue.c | 6 +-- arch/x86/crypto/crc32-pclmul_glue.c | 4 +- arch/x86/crypto/crc32c-intel_glue.c | 4 +- arch/x86/crypto/ghash-clmulni-intel_glue.c | 4 +- arch/x86/crypto/twofish_avx_glue.c | 6 +-- arch/x86/include/asm/crypto/camellia.h | 2 +- crypto/aegis128-core.c | 4 +- crypto/aes_generic.c | 18 +++------ crypto/anubis.c | 2 - crypto/authenc.c | 6 +-- crypto/authencesn.c | 6 +-- crypto/blake2b_generic.c | 4 +- crypto/blake2s_generic.c | 4 +- crypto/camellia_generic.c | 5 +-- crypto/cast6_generic.c | 10 ++--- crypto/cipher.c | 4 +- crypto/crc32_generic.c | 4 +- crypto/crc32c_generic.c | 4 +- crypto/essiv.c | 4 +- crypto/ghash-generic.c | 4 +- crypto/michael_mic.c | 4 +- crypto/skcipher.c | 4 +- crypto/sm4_generic.c | 16 +++----- crypto/twofish_common.c | 8 +--- crypto/vmac.c | 4 +- crypto/xxhash_generic.c | 4 +- .../crypto/allwinner/sun4i-ss/sun4i-ss-cipher.c | 1 - .../crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c | 1 - .../crypto/allwinner/sun8i-ss/sun8i-ss-cipher.c | 2 - drivers/crypto/amcc/crypto4xx_alg.c | 11 ++--- drivers/crypto/amlogic/amlogic-gxl-cipher.c | 1 - drivers/crypto/atmel-aes.c | 9 +---- drivers/crypto/axis/artpec6_crypto.c | 8 +--- drivers/crypto/bcm/cipher.c | 3 -- drivers/crypto/caam/caamalg.c | 33 ++++----------- drivers/crypto/caam/caamalg_qi.c | 44 +++++--------------- drivers/crypto/caam/caamalg_qi2.c | 47 +++++----------------- drivers/crypto/caam/caamhash.c | 9 +---- drivers/crypto/cavium/cpt/cptvf_algs.c | 2 - drivers/crypto/cavium/nitrox/nitrox_aead.c | 4 +- drivers/crypto/cavium/nitrox/nitrox_skcipher.c | 12 ++---- drivers/crypto/ccp/ccp-crypto-aes-cmac.c | 1 - drivers/crypto/ccp/ccp-crypto-aes-galois.c | 1 - drivers/crypto/ccp/ccp-crypto-aes.c | 1 - drivers/crypto/ccp/ccp-crypto-sha.c | 4 +- drivers/crypto/ccree/cc_aead.c | 20 +++------ drivers/crypto/ccree/cc_cipher.c | 3 -- drivers/crypto/ccree/cc_hash.c | 6 --- drivers/crypto/chelsio/chcr_algo.c | 16 ++------ drivers/crypto/geode-aes.c | 8 +--- drivers/crypto/inside-secure/safexcel_cipher.c | 38 +++++------------ drivers/crypto/inside-secure/safexcel_hash.c | 16 ++------ drivers/crypto/ixp4xx_crypto.c | 3 -- drivers/crypto/marvell/cipher.c | 4 +- drivers/crypto/mediatek/mtk-aes.c | 2 - drivers/crypto/n2_core.c | 1 - drivers/crypto/padlock-aes.c | 9 +---- drivers/crypto/picoxcell_crypto.c | 6 +-- drivers/crypto/qat/qat_common/qat_algs.c | 6 +-- drivers/crypto/qce/sha.c | 2 - drivers/crypto/rockchip/rk3288_crypto_skcipher.c | 4 +- 
drivers/crypto/stm32/stm32-crc32.c | 4 +- drivers/crypto/talitos.c | 15 ++----- drivers/crypto/ux500/cryp/cryp_core.c | 2 - drivers/crypto/virtio/virtio_crypto_algs.c | 8 +--- include/crypto/cast6.h | 3 +- include/crypto/internal/des.h | 8 +--- include/crypto/twofish.h | 2 +- include/crypto/xts.h | 8 +--- include/linux/crypto.h | 1 - 93 files changed, 167 insertions(+), 561 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/crypto/aes-ce-glue.c b/arch/arm/crypto/aes-ce-glue.c index cdb1a07e7ad0..b668c97663ec 100644 --- a/arch/arm/crypto/aes-ce-glue.c +++ b/arch/arm/crypto/aes-ce-glue.c @@ -138,14 +138,8 @@ static int ce_aes_setkey(struct crypto_skcipher *tfm, const u8 *in_key, unsigned int key_len) { struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm); - int ret; - - ret = ce_aes_expandkey(ctx, in_key, key_len); - if (!ret) - return 0; - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; + return ce_aes_expandkey(ctx, in_key, key_len); } struct crypto_aes_xts_ctx { @@ -167,11 +161,7 @@ static int xts_set_key(struct crypto_skcipher *tfm, const u8 *in_key, if (!ret) ret = ce_aes_expandkey(&ctx->key2, &in_key[key_len / 2], key_len / 2); - if (!ret) - return 0; - - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; + return ret; } static int ecb_encrypt(struct skcipher_request *req) diff --git a/arch/arm/crypto/crc32-ce-glue.c b/arch/arm/crypto/crc32-ce-glue.c index 95592499b9bd..2208445808d7 100644 --- a/arch/arm/crypto/crc32-ce-glue.c +++ b/arch/arm/crypto/crc32-ce-glue.c @@ -54,10 +54,8 @@ static int crc32_setkey(struct crypto_shash *hash, const u8 *key, { u32 *mctx = crypto_shash_ctx(hash); - if (keylen != sizeof(u32)) { - crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen != sizeof(u32)) return -EINVAL; - } *mctx = le32_to_cpup((__le32 *)key); return 0; } diff --git a/arch/arm/crypto/ghash-ce-glue.c b/arch/arm/crypto/ghash-ce-glue.c index c691077679a6..7e8b2f55685c 100644 --- a/arch/arm/crypto/ghash-ce-glue.c +++ b/arch/arm/crypto/ghash-ce-glue.c @@ -163,10 +163,8 @@ static int ghash_setkey(struct crypto_shash *tfm, struct ghash_key *key = crypto_shash_ctx(tfm); be128 h; - if (keylen != GHASH_BLOCK_SIZE) { - crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen != GHASH_BLOCK_SIZE) return -EINVAL; - } /* needed for the fallback */ memcpy(&key->k, inkey, GHASH_BLOCK_SIZE); diff --git a/arch/arm64/crypto/aes-ce-ccm-glue.c b/arch/arm64/crypto/aes-ce-ccm-glue.c index 541cf9165748..f6d19b0dc893 100644 --- a/arch/arm64/crypto/aes-ce-ccm-glue.c +++ b/arch/arm64/crypto/aes-ce-ccm-glue.c @@ -47,14 +47,8 @@ static int ccm_setkey(struct crypto_aead *tfm, const u8 *in_key, unsigned int key_len) { struct crypto_aes_ctx *ctx = crypto_aead_ctx(tfm); - int ret; - ret = ce_aes_expandkey(ctx, in_key, key_len); - if (!ret) - return 0; - - tfm->base.crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; - return -EINVAL; + return ce_aes_expandkey(ctx, in_key, key_len); } static int ccm_setauthsize(struct crypto_aead *tfm, unsigned int authsize) diff --git a/arch/arm64/crypto/aes-ce-glue.c b/arch/arm64/crypto/aes-ce-glue.c index 6d085dc56c51..56a5f6f0b0c1 100644 --- a/arch/arm64/crypto/aes-ce-glue.c +++ b/arch/arm64/crypto/aes-ce-glue.c @@ -143,14 +143,8 @@ int ce_aes_setkey(struct crypto_tfm *tfm, const u8 *in_key, unsigned int key_len) { struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); - int ret; - ret = ce_aes_expandkey(ctx, in_key, key_len); - if (!ret) - return 0; - - tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; - 
return -EINVAL; + return ce_aes_expandkey(ctx, in_key, key_len); } EXPORT_SYMBOL(ce_aes_setkey); diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index aa57dc639f77..ed5409c6abf4 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -132,13 +132,8 @@ static int skcipher_aes_setkey(struct crypto_skcipher *tfm, const u8 *in_key, unsigned int key_len) { struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm); - int ret; - - ret = aes_expandkey(ctx, in_key, key_len); - if (ret) - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); - return ret; + return aes_expandkey(ctx, in_key, key_len); } static int __maybe_unused xts_set_key(struct crypto_skcipher *tfm, @@ -155,11 +150,7 @@ static int __maybe_unused xts_set_key(struct crypto_skcipher *tfm, if (!ret) ret = aes_expandkey(&ctx->key2, &in_key[key_len / 2], key_len / 2); - if (!ret) - return 0; - - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; + return ret; } static int __maybe_unused essiv_cbc_set_key(struct crypto_skcipher *tfm, @@ -173,19 +164,12 @@ static int __maybe_unused essiv_cbc_set_key(struct crypto_skcipher *tfm, ret = aes_expandkey(&ctx->key1, in_key, key_len); if (ret) - goto out; + return ret; desc->tfm = ctx->hash; crypto_shash_digest(desc, in_key, key_len, digest); - ret = aes_expandkey(&ctx->key2, digest, sizeof(digest)); - if (ret) - goto out; - - return 0; -out: - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; + return aes_expandkey(&ctx->key2, digest, sizeof(digest)); } static int __maybe_unused ecb_encrypt(struct skcipher_request *req) @@ -791,13 +775,8 @@ static int cbcmac_setkey(struct crypto_shash *tfm, const u8 *in_key, unsigned int key_len) { struct mac_tfm_ctx *ctx = crypto_shash_ctx(tfm); - int err; - err = aes_expandkey(&ctx->key, in_key, key_len); - if (err) - crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); - - return err; + return aes_expandkey(&ctx->key, in_key, key_len); } static void cmac_gf128_mul_by_x(be128 *y, const be128 *x) diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c index 196aedd0c20c..22831d3b7f62 100644 --- a/arch/arm64/crypto/ghash-ce-glue.c +++ b/arch/arm64/crypto/ghash-ce-glue.c @@ -248,10 +248,8 @@ static int ghash_setkey(struct crypto_shash *tfm, { struct ghash_key *key = crypto_shash_ctx(tfm); - if (keylen != GHASH_BLOCK_SIZE) { - crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen != GHASH_BLOCK_SIZE) return -EINVAL; - } return __ghash_setkey(key, inkey, keylen); } @@ -306,10 +304,8 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *inkey, int ret; ret = aes_expandkey(&ctx->aes_key, inkey, keylen); - if (ret) { - tfm->base.crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + if (ret) return -EINVAL; - } aes_encrypt(&ctx->aes_key, key, (u8[AES_BLOCK_SIZE]){}); diff --git a/arch/mips/crypto/crc32-mips.c b/arch/mips/crypto/crc32-mips.c index 7d1d2425746f..faa88a6a74c0 100644 --- a/arch/mips/crypto/crc32-mips.c +++ b/arch/mips/crypto/crc32-mips.c @@ -177,10 +177,8 @@ static int chksum_setkey(struct crypto_shash *tfm, const u8 *key, { struct chksum_ctx *mctx = crypto_shash_ctx(tfm); - if (keylen != sizeof(mctx->key)) { - crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen != sizeof(mctx->key)) return -EINVAL; - } mctx->key = get_unaligned_le32(key); return 0; } diff --git a/arch/powerpc/crypto/aes-spe-glue.c b/arch/powerpc/crypto/aes-spe-glue.c index 1fad5d4c658d..c2b23b69d7b1 100644 --- 
a/arch/powerpc/crypto/aes-spe-glue.c +++ b/arch/powerpc/crypto/aes-spe-glue.c @@ -94,13 +94,6 @@ static int ppc_aes_setkey(struct crypto_tfm *tfm, const u8 *in_key, { struct ppc_aes_ctx *ctx = crypto_tfm_ctx(tfm); - if (key_len != AES_KEYSIZE_128 && - key_len != AES_KEYSIZE_192 && - key_len != AES_KEYSIZE_256) { - tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; - return -EINVAL; - } - switch (key_len) { case AES_KEYSIZE_128: ctx->rounds = 4; @@ -114,6 +107,8 @@ static int ppc_aes_setkey(struct crypto_tfm *tfm, const u8 *in_key, ctx->rounds = 6; ppc_expand_key_256(ctx->key_enc, in_key); break; + default: + return -EINVAL; } ppc_generate_decrypt_key(ctx->key_dec, ctx->key_enc, key_len); @@ -139,13 +134,6 @@ static int ppc_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key, key_len >>= 1; - if (key_len != AES_KEYSIZE_128 && - key_len != AES_KEYSIZE_192 && - key_len != AES_KEYSIZE_256) { - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } - switch (key_len) { case AES_KEYSIZE_128: ctx->rounds = 4; @@ -162,6 +150,8 @@ static int ppc_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key, ppc_expand_key_256(ctx->key_enc, in_key); ppc_expand_key_256(ctx->key_twk, in_key + AES_KEYSIZE_256); break; + default: + return -EINVAL; } ppc_generate_decrypt_key(ctx->key_dec, ctx->key_enc, key_len); diff --git a/arch/powerpc/crypto/crc32c-vpmsum_glue.c b/arch/powerpc/crypto/crc32c-vpmsum_glue.c index 2c232898b933..63760b7dbb76 100644 --- a/arch/powerpc/crypto/crc32c-vpmsum_glue.c +++ b/arch/powerpc/crypto/crc32c-vpmsum_glue.c @@ -73,10 +73,8 @@ static int crc32c_vpmsum_setkey(struct crypto_shash *hash, const u8 *key, { u32 *mctx = crypto_shash_ctx(hash); - if (keylen != sizeof(u32)) { - crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen != sizeof(u32)) return -EINVAL; - } *mctx = le32_to_cpup((__le32 *)key); return 0; } diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c index ead0b2c9881d..2db167e5871c 100644 --- a/arch/s390/crypto/aes_s390.c +++ b/arch/s390/crypto/aes_s390.c @@ -414,10 +414,8 @@ static int xts_aes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, return err; /* In fips mode only 128 bit or 256 bit keys are valid */ - if (fips_enabled && key_len != 32 && key_len != 64) { - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (fips_enabled && key_len != 32 && key_len != 64) return -EINVAL; - } /* Pick the correct function code based on the key length */ fc = (key_len == 32) ? 
CPACF_KM_XTS_128 : diff --git a/arch/s390/crypto/crc32-vx.c b/arch/s390/crypto/crc32-vx.c index 423ee05887e6..fafecad20752 100644 --- a/arch/s390/crypto/crc32-vx.c +++ b/arch/s390/crypto/crc32-vx.c @@ -111,10 +111,8 @@ static int crc32_vx_setkey(struct crypto_shash *tfm, const u8 *newkey, { struct crc_ctx *mctx = crypto_shash_ctx(tfm); - if (newkeylen != sizeof(mctx->key)) { - crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (newkeylen != sizeof(mctx->key)) return -EINVAL; - } mctx->key = le32_to_cpu(*(__le32 *)newkey); return 0; } @@ -124,10 +122,8 @@ static int crc32be_vx_setkey(struct crypto_shash *tfm, const u8 *newkey, { struct crc_ctx *mctx = crypto_shash_ctx(tfm); - if (newkeylen != sizeof(mctx->key)) { - crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (newkeylen != sizeof(mctx->key)) return -EINVAL; - } mctx->key = be32_to_cpu(*(__be32 *)newkey); return 0; } diff --git a/arch/s390/crypto/ghash_s390.c b/arch/s390/crypto/ghash_s390.c index a3e7400e031c..6b07a2f1ce8a 100644 --- a/arch/s390/crypto/ghash_s390.c +++ b/arch/s390/crypto/ghash_s390.c @@ -43,10 +43,8 @@ static int ghash_setkey(struct crypto_shash *tfm, { struct ghash_ctx *ctx = crypto_shash_ctx(tfm); - if (keylen != GHASH_BLOCK_SIZE) { - crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen != GHASH_BLOCK_SIZE) return -EINVAL; - } memcpy(ctx->key, key, GHASH_BLOCK_SIZE); diff --git a/arch/s390/crypto/paes_s390.c b/arch/s390/crypto/paes_s390.c index c7119c617b6e..e2a85783f804 100644 --- a/arch/s390/crypto/paes_s390.c +++ b/arch/s390/crypto/paes_s390.c @@ -151,11 +151,7 @@ static int ecb_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, if (rc) return rc; - if (__paes_set_key(ctx)) { - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } - return 0; + return __paes_set_key(ctx); } static int ecb_paes_crypt(struct skcipher_request *req, unsigned long modifier) @@ -254,11 +250,7 @@ static int cbc_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, if (rc) return rc; - if (__cbc_paes_set_key(ctx)) { - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } - return 0; + return __cbc_paes_set_key(ctx); } static int cbc_paes_crypt(struct skcipher_request *req, unsigned long modifier) @@ -386,10 +378,9 @@ static int xts_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, if (rc) return rc; - if (__xts_paes_set_key(ctx)) { - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } + rc = __xts_paes_set_key(ctx); + if (rc) + return rc; /* * xts_check_key verifies the key length is not odd and makes @@ -526,11 +517,7 @@ static int ctr_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, if (rc) return rc; - if (__ctr_paes_set_key(ctx)) { - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } - return 0; + return __ctr_paes_set_key(ctx); } static unsigned int __ctrblk_init(u8 *ctrptr, u8 *iv, unsigned int nbytes) diff --git a/arch/sparc/crypto/aes_glue.c b/arch/sparc/crypto/aes_glue.c index 0f5a501c95a9..e3d2138ff9e2 100644 --- a/arch/sparc/crypto/aes_glue.c +++ b/arch/sparc/crypto/aes_glue.c @@ -169,7 +169,6 @@ static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, unsigned int key_len) { struct crypto_sparc64_aes_ctx *ctx = crypto_tfm_ctx(tfm); - u32 *flags = &tfm->crt_flags; switch (key_len) { case AES_KEYSIZE_128: @@ -188,7 +187,6 @@ static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, break; default: - *flags |= 
CRYPTO_TFM_RES_BAD_KEY_LEN; return -EINVAL; } diff --git a/arch/sparc/crypto/camellia_glue.c b/arch/sparc/crypto/camellia_glue.c index 1700f863748c..aaa9714378e6 100644 --- a/arch/sparc/crypto/camellia_glue.c +++ b/arch/sparc/crypto/camellia_glue.c @@ -39,12 +39,9 @@ static int camellia_set_key(struct crypto_tfm *tfm, const u8 *_in_key, { struct camellia_sparc64_ctx *ctx = crypto_tfm_ctx(tfm); const u32 *in_key = (const u32 *) _in_key; - u32 *flags = &tfm->crt_flags; - if (key_len != 16 && key_len != 24 && key_len != 32) { - *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + if (key_len != 16 && key_len != 24 && key_len != 32) return -EINVAL; - } ctx->key_len = key_len; diff --git a/arch/sparc/crypto/crc32c_glue.c b/arch/sparc/crypto/crc32c_glue.c index 1299073285a3..4e9323229e71 100644 --- a/arch/sparc/crypto/crc32c_glue.c +++ b/arch/sparc/crypto/crc32c_glue.c @@ -33,10 +33,8 @@ static int crc32c_sparc64_setkey(struct crypto_shash *hash, const u8 *key, { u32 *mctx = crypto_shash_ctx(hash); - if (keylen != sizeof(u32)) { - crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen != sizeof(u32)) return -EINVAL; - } *(__le32 *)mctx = le32_to_cpup((__le32 *)key); return 0; } diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c index 46d227122643..4623189000d8 100644 --- a/arch/x86/crypto/aegis128-aesni-glue.c +++ b/arch/x86/crypto/aegis128-aesni-glue.c @@ -144,10 +144,8 @@ static int crypto_aegis128_aesni_setkey(struct crypto_aead *aead, const u8 *key, { struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(aead); - if (keylen != AEGIS128_KEY_SIZE) { - crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen != AEGIS128_KEY_SIZE) return -EINVAL; - } memcpy(ctx->key.bytes, key, AEGIS128_KEY_SIZE); diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 670f8fcf2544..bbbebbd35b5d 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -316,14 +316,11 @@ static int aes_set_key_common(struct crypto_tfm *tfm, void *raw_ctx, const u8 *in_key, unsigned int key_len) { struct crypto_aes_ctx *ctx = aes_ctx(raw_ctx); - u32 *flags = &tfm->crt_flags; int err; if (key_len != AES_KEYSIZE_128 && key_len != AES_KEYSIZE_192 && - key_len != AES_KEYSIZE_256) { - *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + key_len != AES_KEYSIZE_256) return -EINVAL; - } if (!crypto_simd_usable()) err = aes_expandkey(ctx, in_key, key_len); @@ -641,10 +638,9 @@ static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key, { struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(aead); - if (key_len < 4) { - crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (key_len < 4) return -EINVAL; - } + /*Account for 4 byte nonce at the end.*/ key_len -= 4; diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c index 1d9ff8a45e1f..06ef2d4a4701 100644 --- a/arch/x86/crypto/blake2s-glue.c +++ b/arch/x86/crypto/blake2s-glue.c @@ -64,10 +64,8 @@ static int crypto_blake2s_setkey(struct crypto_shash *tfm, const u8 *key, { struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(tfm); - if (keylen == 0 || keylen > BLAKE2S_KEY_SIZE) { - crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen == 0 || keylen > BLAKE2S_KEY_SIZE) return -EINVAL; - } memcpy(tctx->key, key, keylen); tctx->keylen = keylen; diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c index a8cc2c83fe1b..ccda647422d6 100644 --- 
a/arch/x86/crypto/camellia_aesni_avx2_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c @@ -142,8 +142,7 @@ static const struct common_glue_ctx camellia_dec_xts = { static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { - return __camellia_setkey(crypto_skcipher_ctx(tfm), key, keylen, - &tfm->base.crt_flags); + return __camellia_setkey(crypto_skcipher_ctx(tfm), key, keylen); } static int ecb_encrypt(struct skcipher_request *req) diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c index 31a82a79f4ac..4e5de6ef206e 100644 --- a/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c @@ -144,8 +144,7 @@ static const struct common_glue_ctx camellia_dec_xts = { static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { - return __camellia_setkey(crypto_skcipher_ctx(tfm), key, keylen, - &tfm->base.crt_flags); + return __camellia_setkey(crypto_skcipher_ctx(tfm), key, keylen); } static int ecb_encrypt(struct skcipher_request *req) @@ -177,7 +176,6 @@ int xts_camellia_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - u32 *flags = &tfm->base.crt_flags; int err; err = xts_verify_key(tfm, key, keylen); @@ -185,13 +183,12 @@ int xts_camellia_setkey(struct crypto_skcipher *tfm, const u8 *key, return err; /* first half of xts-key is for crypt */ - err = __camellia_setkey(&ctx->crypt_ctx, key, keylen / 2, flags); + err = __camellia_setkey(&ctx->crypt_ctx, key, keylen / 2); if (err) return err; /* second half of xts-key is for tweak */ - return __camellia_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2, - flags); + return __camellia_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2); } EXPORT_SYMBOL_GPL(xts_camellia_setkey); diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c index 5f3ed5af68d7..242c056e5fa8 100644 --- a/arch/x86/crypto/camellia_glue.c +++ b/arch/x86/crypto/camellia_glue.c @@ -1227,12 +1227,10 @@ static void camellia_setup192(const unsigned char *key, u64 *subkey) } int __camellia_setkey(struct camellia_ctx *cctx, const unsigned char *key, - unsigned int key_len, u32 *flags) + unsigned int key_len) { - if (key_len != 16 && key_len != 24 && key_len != 32) { - *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + if (key_len != 16 && key_len != 24 && key_len != 32) return -EINVAL; - } cctx->key_length = key_len; @@ -1255,8 +1253,7 @@ EXPORT_SYMBOL_GPL(__camellia_setkey); static int camellia_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int key_len) { - return __camellia_setkey(crypto_tfm_ctx(tfm), key, key_len, - &tfm->crt_flags); + return __camellia_setkey(crypto_tfm_ctx(tfm), key, key_len); } static int camellia_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c index da5297475f9e..48e0f37796fa 100644 --- a/arch/x86/crypto/cast6_avx_glue.c +++ b/arch/x86/crypto/cast6_avx_glue.c @@ -173,7 +173,6 @@ static int xts_cast6_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { struct cast6_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - u32 *flags = &tfm->base.crt_flags; int err; err = xts_verify_key(tfm, key, keylen); @@ -181,13 +180,12 @@ static int xts_cast6_setkey(struct crypto_skcipher *tfm, const u8 *key, return err; /* first half of xts-key is for crypt */ - err = __cast6_setkey(&ctx->crypt_ctx, key, keylen / 2, 
flags); + err = __cast6_setkey(&ctx->crypt_ctx, key, keylen / 2); if (err) return err; /* second half of xts-key is for tweak */ - return __cast6_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2, - flags); + return __cast6_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2); } static int xts_encrypt(struct skcipher_request *req) diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c index cb4ab6645106..418bd88acac8 100644 --- a/arch/x86/crypto/crc32-pclmul_glue.c +++ b/arch/x86/crypto/crc32-pclmul_glue.c @@ -94,10 +94,8 @@ static int crc32_pclmul_setkey(struct crypto_shash *hash, const u8 *key, { u32 *mctx = crypto_shash_ctx(hash); - if (keylen != sizeof(u32)) { - crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen != sizeof(u32)) return -EINVAL; - } *mctx = le32_to_cpup((__le32 *)key); return 0; } diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c index eefa0862f309..c20d1b8a82c3 100644 --- a/arch/x86/crypto/crc32c-intel_glue.c +++ b/arch/x86/crypto/crc32c-intel_glue.c @@ -91,10 +91,8 @@ static int crc32c_intel_setkey(struct crypto_shash *hash, const u8 *key, { u32 *mctx = crypto_shash_ctx(hash); - if (keylen != sizeof(u32)) { - crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen != sizeof(u32)) return -EINVAL; - } *mctx = le32_to_cpup((__le32 *)key); return 0; } diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index 04d72a5a8ce9..4a9c9833a7d6 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -57,10 +57,8 @@ static int ghash_setkey(struct crypto_shash *tfm, be128 *x = (be128 *)key; u64 a, b; - if (keylen != GHASH_BLOCK_SIZE) { - crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen != GHASH_BLOCK_SIZE) return -EINVAL; - } /* perform multiplication by 'x' in GF(2^128) */ a = be64_to_cpu(x->a); diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index 3b36e97ec7ab..2dbc8ce3730e 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c @@ -64,7 +64,6 @@ static int xts_twofish_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { struct twofish_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - u32 *flags = &tfm->base.crt_flags; int err; err = xts_verify_key(tfm, key, keylen); @@ -72,13 +71,12 @@ static int xts_twofish_setkey(struct crypto_skcipher *tfm, const u8 *key, return err; /* first half of xts-key is for crypt */ - err = __twofish_setkey(&ctx->crypt_ctx, key, keylen / 2, flags); + err = __twofish_setkey(&ctx->crypt_ctx, key, keylen / 2); if (err) return err; /* second half of xts-key is for tweak */ - return __twofish_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2, - flags); + return __twofish_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2); } static const struct common_glue_ctx twofish_enc = { diff --git a/arch/x86/include/asm/crypto/camellia.h b/arch/x86/include/asm/crypto/camellia.h index f1592619dd65..f6d91861cb14 100644 --- a/arch/x86/include/asm/crypto/camellia.h +++ b/arch/x86/include/asm/crypto/camellia.h @@ -26,7 +26,7 @@ struct camellia_xts_ctx { extern int __camellia_setkey(struct camellia_ctx *cctx, const unsigned char *key, - unsigned int key_len, u32 *flags); + unsigned int key_len); extern int xts_camellia_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen); diff --git a/crypto/aegis128-core.c b/crypto/aegis128-core.c index 
71c11cb5bad1..44fb4956f0dd 100644 --- a/crypto/aegis128-core.c +++ b/crypto/aegis128-core.c @@ -372,10 +372,8 @@ static int crypto_aegis128_setkey(struct crypto_aead *aead, const u8 *key, { struct aegis_ctx *ctx = crypto_aead_ctx(aead); - if (keylen != AEGIS128_KEY_SIZE) { - crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen != AEGIS128_KEY_SIZE) return -EINVAL; - } memcpy(ctx->key.bytes, key, AEGIS128_KEY_SIZE); return 0; diff --git a/crypto/aes_generic.c b/crypto/aes_generic.c index 22e5867177f1..27ab27931813 100644 --- a/crypto/aes_generic.c +++ b/crypto/aes_generic.c @@ -1127,24 +1127,18 @@ EXPORT_SYMBOL_GPL(crypto_it_tab); * @in_key: The input key. * @key_len: The size of the key. * - * Returns 0 on success, on failure the %CRYPTO_TFM_RES_BAD_KEY_LEN flag in tfm - * is set. The function uses aes_expand_key() to expand the key. - * &crypto_aes_ctx _must_ be the private data embedded in @tfm which is - * retrieved with crypto_tfm_ctx(). + * This function uses aes_expand_key() to expand the key. &crypto_aes_ctx + * _must_ be the private data embedded in @tfm which is retrieved with + * crypto_tfm_ctx(). + * + * Return: 0 on success; -EINVAL on failure (only happens for bad key lengths) */ int crypto_aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, unsigned int key_len) { struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); - u32 *flags = &tfm->crt_flags; - int ret; - - ret = aes_expandkey(ctx, in_key, key_len); - if (!ret) - return 0; - *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; - return -EINVAL; + return aes_expandkey(ctx, in_key, key_len); } EXPORT_SYMBOL_GPL(crypto_aes_set_key); diff --git a/crypto/anubis.c b/crypto/anubis.c index f9ce78fde6ee..5da0241ef453 100644 --- a/crypto/anubis.c +++ b/crypto/anubis.c @@ -464,7 +464,6 @@ static int anubis_setkey(struct crypto_tfm *tfm, const u8 *in_key, { struct anubis_ctx *ctx = crypto_tfm_ctx(tfm); const __be32 *key = (const __be32 *)in_key; - u32 *flags = &tfm->crt_flags; int N, R, i, r; u32 kappa[ANUBIS_MAX_N]; u32 inter[ANUBIS_MAX_N]; @@ -474,7 +473,6 @@ static int anubis_setkey(struct crypto_tfm *tfm, const u8 *in_key, case 32: case 36: case 40: break; default: - *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; return -EINVAL; } diff --git a/crypto/authenc.c b/crypto/authenc.c index 3f0ed9402582..0da80632e872 100644 --- a/crypto/authenc.c +++ b/crypto/authenc.c @@ -91,7 +91,7 @@ static int crypto_authenc_setkey(struct crypto_aead *authenc, const u8 *key, int err = -EINVAL; if (crypto_authenc_extractkeys(&keys, key, keylen) != 0) - goto badkey; + goto out; crypto_ahash_clear_flags(auth, CRYPTO_TFM_REQ_MASK); crypto_ahash_set_flags(auth, crypto_aead_get_flags(authenc) & @@ -113,10 +113,6 @@ static int crypto_authenc_setkey(struct crypto_aead *authenc, const u8 *key, out: memzero_explicit(&keys, sizeof(keys)); return err; - -badkey: - crypto_aead_set_flags(authenc, CRYPTO_TFM_RES_BAD_KEY_LEN); - goto out; } static void authenc_geniv_ahash_done(struct crypto_async_request *areq, int err) diff --git a/crypto/authencesn.c b/crypto/authencesn.c index adb7554fca29..749527e1b617 100644 --- a/crypto/authencesn.c +++ b/crypto/authencesn.c @@ -65,7 +65,7 @@ static int crypto_authenc_esn_setkey(struct crypto_aead *authenc_esn, const u8 * int err = -EINVAL; if (crypto_authenc_extractkeys(&keys, key, keylen) != 0) - goto badkey; + goto out; crypto_ahash_clear_flags(auth, CRYPTO_TFM_REQ_MASK); crypto_ahash_set_flags(auth, crypto_aead_get_flags(authenc_esn) & @@ -87,10 +87,6 @@ static int crypto_authenc_esn_setkey(struct crypto_aead *authenc_esn, 
const u8 * out: memzero_explicit(&keys, sizeof(keys)); return err; - -badkey: - crypto_aead_set_flags(authenc_esn, CRYPTO_TFM_RES_BAD_KEY_LEN); - goto out; } static int crypto_authenc_esn_genicv_tail(struct aead_request *req, diff --git a/crypto/blake2b_generic.c b/crypto/blake2b_generic.c index d04b1788dc42..1d262374fa4e 100644 --- a/crypto/blake2b_generic.c +++ b/crypto/blake2b_generic.c @@ -147,10 +147,8 @@ static int blake2b_setkey(struct crypto_shash *tfm, const u8 *key, { struct blake2b_tfm_ctx *tctx = crypto_shash_ctx(tfm); - if (keylen == 0 || keylen > BLAKE2B_KEYBYTES) { - crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen == 0 || keylen > BLAKE2B_KEYBYTES) return -EINVAL; - } memcpy(tctx->key, key, keylen); tctx->keylen = keylen; diff --git a/crypto/blake2s_generic.c b/crypto/blake2s_generic.c index ed0c74640470..005783ff45ad 100644 --- a/crypto/blake2s_generic.c +++ b/crypto/blake2s_generic.c @@ -17,10 +17,8 @@ static int crypto_blake2s_setkey(struct crypto_shash *tfm, const u8 *key, { struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(tfm); - if (keylen == 0 || keylen > BLAKE2S_KEY_SIZE) { - crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen == 0 || keylen > BLAKE2S_KEY_SIZE) return -EINVAL; - } memcpy(tctx->key, key, keylen); tctx->keylen = keylen; diff --git a/crypto/camellia_generic.c b/crypto/camellia_generic.c index b6a1121e2478..9a5783e5196a 100644 --- a/crypto/camellia_generic.c +++ b/crypto/camellia_generic.c @@ -970,12 +970,9 @@ camellia_set_key(struct crypto_tfm *tfm, const u8 *in_key, { struct camellia_ctx *cctx = crypto_tfm_ctx(tfm); const unsigned char *key = (const unsigned char *)in_key; - u32 *flags = &tfm->crt_flags; - if (key_len != 16 && key_len != 24 && key_len != 32) { - *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + if (key_len != 16 && key_len != 24 && key_len != 32) return -EINVAL; - } cctx->key_length = key_len; diff --git a/crypto/cast6_generic.c b/crypto/cast6_generic.c index 85328522c5ca..c77ff6c8a2b2 100644 --- a/crypto/cast6_generic.c +++ b/crypto/cast6_generic.c @@ -103,17 +103,14 @@ static inline void W(u32 *key, unsigned int i) key[7] ^= F2(key[0], Tr[i % 4][7], Tm[i][7]); } -int __cast6_setkey(struct cast6_ctx *c, const u8 *in_key, - unsigned key_len, u32 *flags) +int __cast6_setkey(struct cast6_ctx *c, const u8 *in_key, unsigned int key_len) { int i; u32 key[8]; __be32 p_key[8]; /* padded key */ - if (key_len % 4 != 0) { - *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + if (key_len % 4 != 0) return -EINVAL; - } memset(p_key, 0, 32); memcpy(p_key, in_key, key_len); @@ -148,8 +145,7 @@ EXPORT_SYMBOL_GPL(__cast6_setkey); int cast6_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen) { - return __cast6_setkey(crypto_tfm_ctx(tfm), key, keylen, - &tfm->crt_flags); + return __cast6_setkey(crypto_tfm_ctx(tfm), key, keylen); } EXPORT_SYMBOL_GPL(cast6_setkey); diff --git a/crypto/cipher.c b/crypto/cipher.c index aadd51cb7250..0fb7042a709d 100644 --- a/crypto/cipher.c +++ b/crypto/cipher.c @@ -46,10 +46,8 @@ int crypto_cipher_setkey(struct crypto_cipher *tfm, unsigned long alignmask = crypto_cipher_alignmask(tfm); crypto_cipher_clear_flags(tfm, CRYPTO_TFM_RES_MASK); - if (keylen < cia->cia_min_keysize || keylen > cia->cia_max_keysize) { - crypto_cipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen < cia->cia_min_keysize || keylen > cia->cia_max_keysize) return -EINVAL; - } if ((unsigned long)key & alignmask) return setkey_unaligned(tfm, key, keylen); diff --git a/crypto/crc32_generic.c b/crypto/crc32_generic.c 
index 9e97912280bd..0e103fb5dd77 100644 --- a/crypto/crc32_generic.c +++ b/crypto/crc32_generic.c @@ -60,10 +60,8 @@ static int crc32_setkey(struct crypto_shash *hash, const u8 *key, { u32 *mctx = crypto_shash_ctx(hash); - if (keylen != sizeof(u32)) { - crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen != sizeof(u32)) return -EINVAL; - } *mctx = get_unaligned_le32(key); return 0; } diff --git a/crypto/crc32c_generic.c b/crypto/crc32c_generic.c index 7b25fe82072c..7fa9b0788685 100644 --- a/crypto/crc32c_generic.c +++ b/crypto/crc32c_generic.c @@ -74,10 +74,8 @@ static int chksum_setkey(struct crypto_shash *tfm, const u8 *key, { struct chksum_ctx *mctx = crypto_shash_ctx(tfm); - if (keylen != sizeof(mctx->key)) { - crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen != sizeof(mctx->key)) return -EINVAL; - } mctx->key = get_unaligned_le32(key); return 0; } diff --git a/crypto/essiv.c b/crypto/essiv.c index e4b32c2ea7ec..f49bd6fc6972 100644 --- a/crypto/essiv.c +++ b/crypto/essiv.c @@ -117,10 +117,8 @@ static int essiv_aead_setkey(struct crypto_aead *tfm, const u8 *key, if (err) return err; - if (crypto_authenc_extractkeys(&keys, key, keylen) != 0) { - crypto_aead_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (crypto_authenc_extractkeys(&keys, key, keylen) != 0) return -EINVAL; - } desc->tfm = tctx->hash; err = crypto_shash_init(desc) ?: diff --git a/crypto/ghash-generic.c b/crypto/ghash-generic.c index 5027b3461c92..c70d163c1ac9 100644 --- a/crypto/ghash-generic.c +++ b/crypto/ghash-generic.c @@ -58,10 +58,8 @@ static int ghash_setkey(struct crypto_shash *tfm, struct ghash_ctx *ctx = crypto_shash_ctx(tfm); be128 k; - if (keylen != GHASH_BLOCK_SIZE) { - crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen != GHASH_BLOCK_SIZE) return -EINVAL; - } if (ctx->gf128) gf128mul_free_4k(ctx->gf128); diff --git a/crypto/michael_mic.c b/crypto/michael_mic.c index 20e6220f46f6..63350c4ad461 100644 --- a/crypto/michael_mic.c +++ b/crypto/michael_mic.c @@ -137,10 +137,8 @@ static int michael_setkey(struct crypto_shash *tfm, const u8 *key, const __le32 *data = (const __le32 *)key; - if (keylen != 8) { - crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen != 8) return -EINVAL; - } mctx->l = le32_to_cpu(data[0]); mctx->r = le32_to_cpu(data[1]); diff --git a/crypto/skcipher.c b/crypto/skcipher.c index 457e4ddc1482..8c8735f75478 100644 --- a/crypto/skcipher.c +++ b/crypto/skcipher.c @@ -603,10 +603,8 @@ int crypto_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned long alignmask = crypto_skcipher_alignmask(tfm); int err; - if (keylen < cipher->min_keysize || keylen > cipher->max_keysize) { - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen < cipher->min_keysize || keylen > cipher->max_keysize) return -EINVAL; - } if ((unsigned long)key & alignmask) err = skcipher_setkey_unaligned(tfm, key, keylen); diff --git a/crypto/sm4_generic.c b/crypto/sm4_generic.c index 71ffb343709a..016dbc595705 100644 --- a/crypto/sm4_generic.c +++ b/crypto/sm4_generic.c @@ -143,29 +143,23 @@ int crypto_sm4_expand_key(struct crypto_sm4_ctx *ctx, const u8 *in_key, EXPORT_SYMBOL_GPL(crypto_sm4_expand_key); /** - * crypto_sm4_set_key - Set the AES key. + * crypto_sm4_set_key - Set the SM4 key. * @tfm: The %crypto_tfm that is used in the context. * @in_key: The input key. * @key_len: The size of the key. * - * Returns 0 on success, on failure the %CRYPTO_TFM_RES_BAD_KEY_LEN flag in tfm - * is set. 
The function uses crypto_sm4_expand_key() to expand the key. + * This function uses crypto_sm4_expand_key() to expand the key. * &crypto_sm4_ctx _must_ be the private data embedded in @tfm which is * retrieved with crypto_tfm_ctx(). + * + * Return: 0 on success; -EINVAL on failure (only happens for bad key lengths) */ int crypto_sm4_set_key(struct crypto_tfm *tfm, const u8 *in_key, unsigned int key_len) { struct crypto_sm4_ctx *ctx = crypto_tfm_ctx(tfm); - u32 *flags = &tfm->crt_flags; - int ret; - - ret = crypto_sm4_expand_key(ctx, in_key, key_len); - if (!ret) - return 0; - *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; - return -EINVAL; + return crypto_sm4_expand_key(ctx, in_key, key_len); } EXPORT_SYMBOL_GPL(crypto_sm4_set_key); diff --git a/crypto/twofish_common.c b/crypto/twofish_common.c index 222fc765c57a..d23fa531b91f 100644 --- a/crypto/twofish_common.c +++ b/crypto/twofish_common.c @@ -567,7 +567,7 @@ static const u8 calc_sb_tbl[512] = { /* Perform the key setup. */ int __twofish_setkey(struct twofish_ctx *ctx, const u8 *key, - unsigned int key_len, u32 *flags) + unsigned int key_len) { int i, j, k; @@ -584,10 +584,7 @@ int __twofish_setkey(struct twofish_ctx *ctx, const u8 *key, /* Check key length. */ if (key_len % 8) - { - *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; return -EINVAL; /* unsupported key length */ - } /* Compute the first two words of the S vector. The magic numbers are * the entries of the RS matrix, preprocessed through poly_to_exp. The @@ -688,8 +685,7 @@ EXPORT_SYMBOL_GPL(__twofish_setkey); int twofish_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int key_len) { - return __twofish_setkey(crypto_tfm_ctx(tfm), key, key_len, - &tfm->crt_flags); + return __twofish_setkey(crypto_tfm_ctx(tfm), key, key_len); } EXPORT_SYMBOL_GPL(twofish_setkey); diff --git a/crypto/vmac.c b/crypto/vmac.c index f50a85060b39..0bbb34dc87c4 100644 --- a/crypto/vmac.c +++ b/crypto/vmac.c @@ -435,10 +435,8 @@ static int vmac_setkey(struct crypto_shash *tfm, unsigned int i; int err; - if (keylen != VMAC_KEY_LEN) { - crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen != VMAC_KEY_LEN) return -EINVAL; - } err = crypto_cipher_setkey(tctx->cipher, key, keylen); if (err) diff --git a/crypto/xxhash_generic.c b/crypto/xxhash_generic.c index 4aad2c0f40a9..55d1c8a76127 100644 --- a/crypto/xxhash_generic.c +++ b/crypto/xxhash_generic.c @@ -22,10 +22,8 @@ static int xxhash64_setkey(struct crypto_shash *tfm, const u8 *key, { struct xxhash64_tfm_ctx *tctx = crypto_shash_ctx(tfm); - if (keylen != sizeof(tctx->seed)) { - crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen != sizeof(tctx->seed)) return -EINVAL; - } tctx->seed = get_unaligned_le64(key); return 0; } diff --git a/drivers/crypto/allwinner/sun4i-ss/sun4i-ss-cipher.c b/drivers/crypto/allwinner/sun4i-ss/sun4i-ss-cipher.c index cb2b0874f68f..7f22d305178e 100644 --- a/drivers/crypto/allwinner/sun4i-ss/sun4i-ss-cipher.c +++ b/drivers/crypto/allwinner/sun4i-ss/sun4i-ss-cipher.c @@ -541,7 +541,6 @@ int sun4i_ss_aes_setkey(struct crypto_skcipher *tfm, const u8 *key, break; default: dev_dbg(ss->dev, "ERROR: Invalid keylen %u\n", keylen); - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } op->keylen = keylen; diff --git a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c index 37d0b6c386a0..b102da74b731 100644 --- a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c +++ b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c @@ -394,7 +394,6 
@@ int sun8i_ce_aes_setkey(struct crypto_skcipher *tfm, const u8 *key, break; default: dev_dbg(ce->dev, "ERROR: Invalid keylen %u\n", keylen); - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } if (op->key) { diff --git a/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-cipher.c b/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-cipher.c index f222979a5623..84d52fc3a2da 100644 --- a/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-cipher.c +++ b/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-cipher.c @@ -390,7 +390,6 @@ int sun8i_ss_aes_setkey(struct crypto_skcipher *tfm, const u8 *key, break; default: dev_dbg(ss->dev, "ERROR: Invalid keylen %u\n", keylen); - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } if (op->key) { @@ -416,7 +415,6 @@ int sun8i_ss_des3_setkey(struct crypto_skcipher *tfm, const u8 *key, if (unlikely(keylen != 3 * DES_KEY_SIZE)) { dev_dbg(ss->dev, "Invalid keylen %u\n", keylen); - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } diff --git a/drivers/crypto/amcc/crypto4xx_alg.c b/drivers/crypto/amcc/crypto4xx_alg.c index a42f8619589d..121eb81df64f 100644 --- a/drivers/crypto/amcc/crypto4xx_alg.c +++ b/drivers/crypto/amcc/crypto4xx_alg.c @@ -128,12 +128,9 @@ static int crypto4xx_setkey_aes(struct crypto_skcipher *cipher, struct dynamic_sa_ctl *sa; int rc; - if (keylen != AES_KEYSIZE_256 && - keylen != AES_KEYSIZE_192 && keylen != AES_KEYSIZE_128) { - crypto_skcipher_set_flags(cipher, - CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen != AES_KEYSIZE_256 && keylen != AES_KEYSIZE_192 && + keylen != AES_KEYSIZE_128) return -EINVAL; - } /* Create SA */ if (ctx->sa_in || ctx->sa_out) @@ -551,10 +548,8 @@ int crypto4xx_setkey_aes_gcm(struct crypto_aead *cipher, struct dynamic_sa_ctl *sa; int rc = 0; - if (crypto4xx_aes_gcm_validate_keylen(keylen) != 0) { - crypto_aead_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (crypto4xx_aes_gcm_validate_keylen(keylen) != 0) return -EINVAL; - } rc = crypto4xx_aead_setup_fallback(ctx, cipher, key, keylen); if (rc) diff --git a/drivers/crypto/amlogic/amlogic-gxl-cipher.c b/drivers/crypto/amlogic/amlogic-gxl-cipher.c index e589015aac1c..9819dd50fbad 100644 --- a/drivers/crypto/amlogic/amlogic-gxl-cipher.c +++ b/drivers/crypto/amlogic/amlogic-gxl-cipher.c @@ -366,7 +366,6 @@ int meson_aes_setkey(struct crypto_skcipher *tfm, const u8 *key, break; default: dev_dbg(mc->dev, "ERROR: Invalid keylen %u\n", keylen); - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } if (op->key) { diff --git a/drivers/crypto/atmel-aes.c b/drivers/crypto/atmel-aes.c index b001fdcd9d95..898f66cb2eb2 100644 --- a/drivers/crypto/atmel-aes.c +++ b/drivers/crypto/atmel-aes.c @@ -1140,10 +1140,8 @@ static int atmel_aes_setkey(struct crypto_skcipher *tfm, const u8 *key, if (keylen != AES_KEYSIZE_128 && keylen != AES_KEYSIZE_192 && - keylen != AES_KEYSIZE_256) { - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + keylen != AES_KEYSIZE_256) return -EINVAL; - } memcpy(ctx->key, key, keylen); ctx->keylen = keylen; @@ -1716,10 +1714,8 @@ static int atmel_aes_gcm_setkey(struct crypto_aead *tfm, const u8 *key, if (keylen != AES_KEYSIZE_256 && keylen != AES_KEYSIZE_192 && - keylen != AES_KEYSIZE_128) { - crypto_aead_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + keylen != AES_KEYSIZE_128) return -EINVAL; - } memcpy(ctx->key, key, keylen); ctx->keylen = keylen; @@ -2073,7 +2069,6 @@ static int atmel_aes_authenc_setkey(struct crypto_aead *tfm, const u8 *key, return 0; badkey: - 
crypto_aead_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); memzero_explicit(&keys, sizeof(keys)); return -EINVAL; } diff --git a/drivers/crypto/axis/artpec6_crypto.c b/drivers/crypto/axis/artpec6_crypto.c index 22ebe40f09f5..fcf1effc7661 100644 --- a/drivers/crypto/axis/artpec6_crypto.c +++ b/drivers/crypto/axis/artpec6_crypto.c @@ -1249,10 +1249,8 @@ static int artpec6_crypto_aead_set_key(struct crypto_aead *tfm, const u8 *key, { struct artpec6_cryptotfm_context *ctx = crypto_tfm_ctx(&tfm->base); - if (len != 16 && len != 24 && len != 32) { - crypto_aead_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (len != 16 && len != 24 && len != 32) return -EINVAL; - } ctx->key_length = len; @@ -1606,8 +1604,6 @@ artpec6_crypto_cipher_set_key(struct crypto_skcipher *cipher, const u8 *key, case 32: break; default: - crypto_skcipher_set_flags(cipher, - CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } @@ -1634,8 +1630,6 @@ artpec6_crypto_xts_set_key(struct crypto_skcipher *cipher, const u8 *key, case 64: break; default: - crypto_skcipher_set_flags(cipher, - CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } diff --git a/drivers/crypto/bcm/cipher.c b/drivers/crypto/bcm/cipher.c index 1564a6f8c9cb..184a3e1245cf 100644 --- a/drivers/crypto/bcm/cipher.c +++ b/drivers/crypto/bcm/cipher.c @@ -1846,7 +1846,6 @@ static int aes_setkey(struct crypto_skcipher *cipher, const u8 *key, ctx->cipher_type = CIPHER_TYPE_AES256; break; default: - crypto_skcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } WARN_ON((ctx->max_payload != SPU_MAX_PAYLOAD_INF) && @@ -2916,7 +2915,6 @@ badkey: ctx->authkeylen = 0; ctx->digestsize = 0; - crypto_aead_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } @@ -2992,7 +2990,6 @@ badkey: ctx->authkeylen = 0; ctx->digestsize = 0; - crypto_aead_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c index 2912006b946b..ef1a65f4fc92 100644 --- a/drivers/crypto/caam/caamalg.c +++ b/drivers/crypto/caam/caamalg.c @@ -548,10 +548,8 @@ static int chachapoly_setkey(struct crypto_aead *aead, const u8 *key, unsigned int ivsize = crypto_aead_ivsize(aead); unsigned int saltlen = CHACHAPOLY_IV_SIZE - ivsize; - if (keylen != CHACHA_KEY_SIZE + saltlen) { - crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen != CHACHA_KEY_SIZE + saltlen) return -EINVAL; - } ctx->cdata.key_virt = key; ctx->cdata.keylen = keylen - saltlen; @@ -619,7 +617,6 @@ skip_split_key: memzero_explicit(&keys, sizeof(keys)); return aead_set_sh_desc(aead); badkey: - crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); memzero_explicit(&keys, sizeof(keys)); return -EINVAL; } @@ -649,10 +646,8 @@ static int gcm_setkey(struct crypto_aead *aead, int err; err = aes_check_keylen(keylen); - if (err) { - crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (err) return err; - } print_hex_dump_debug("key in @"__stringify(__LINE__)": ", DUMP_PREFIX_ADDRESS, 16, 4, key, keylen, 1); @@ -672,10 +667,8 @@ static int rfc4106_setkey(struct crypto_aead *aead, int err; err = aes_check_keylen(keylen - 4); - if (err) { - crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (err) return err; - } print_hex_dump_debug("key in @"__stringify(__LINE__)": ", DUMP_PREFIX_ADDRESS, 16, 4, key, keylen, 1); @@ -700,10 +693,8 @@ static int rfc4543_setkey(struct crypto_aead *aead, int err; err = aes_check_keylen(keylen - 4); - if (err) { - crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (err) return err; - 
} print_hex_dump_debug("key in @"__stringify(__LINE__)": ", DUMP_PREFIX_ADDRESS, 16, 4, key, keylen, 1); @@ -762,11 +753,8 @@ static int aes_skcipher_setkey(struct crypto_skcipher *skcipher, int err; err = aes_check_keylen(keylen); - if (err) { - crypto_skcipher_set_flags(skcipher, - CRYPTO_TFM_RES_BAD_KEY_LEN); + if (err) return err; - } return skcipher_setkey(skcipher, key, keylen, 0); } @@ -786,11 +774,8 @@ static int rfc3686_skcipher_setkey(struct crypto_skcipher *skcipher, keylen -= CTR_RFC3686_NONCE_SIZE; err = aes_check_keylen(keylen); - if (err) { - crypto_skcipher_set_flags(skcipher, - CRYPTO_TFM_RES_BAD_KEY_LEN); + if (err) return err; - } return skcipher_setkey(skcipher, key, keylen, ctx1_iv_off); } @@ -809,11 +794,8 @@ static int ctr_skcipher_setkey(struct crypto_skcipher *skcipher, ctx1_iv_off = 16; err = aes_check_keylen(keylen); - if (err) { - crypto_skcipher_set_flags(skcipher, - CRYPTO_TFM_RES_BAD_KEY_LEN); + if (err) return err; - } return skcipher_setkey(skcipher, key, keylen, ctx1_iv_off); } @@ -846,7 +828,6 @@ static int xts_skcipher_setkey(struct crypto_skcipher *skcipher, const u8 *key, u32 *desc; if (keylen != 2 * AES_MIN_KEY_SIZE && keylen != 2 * AES_MAX_KEY_SIZE) { - crypto_skcipher_set_flags(skcipher, CRYPTO_TFM_RES_BAD_KEY_LEN); dev_err(jrdev, "key size mismatch\n"); return -EINVAL; } diff --git a/drivers/crypto/caam/caamalg_qi.c b/drivers/crypto/caam/caamalg_qi.c index 8e3449670d2f..4a29e0ef9d63 100644 --- a/drivers/crypto/caam/caamalg_qi.c +++ b/drivers/crypto/caam/caamalg_qi.c @@ -268,7 +268,6 @@ skip_split_key: memzero_explicit(&keys, sizeof(keys)); return ret; badkey: - crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); memzero_explicit(&keys, sizeof(keys)); return -EINVAL; } @@ -356,10 +355,8 @@ static int gcm_setkey(struct crypto_aead *aead, int ret; ret = aes_check_keylen(keylen); - if (ret) { - crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (ret) return ret; - } print_hex_dump_debug("key in @" __stringify(__LINE__)": ", DUMP_PREFIX_ADDRESS, 16, 4, key, keylen, 1); @@ -462,10 +459,8 @@ static int rfc4106_setkey(struct crypto_aead *aead, int ret; ret = aes_check_keylen(keylen - 4); - if (ret) { - crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (ret) return ret; - } print_hex_dump_debug("key in @" __stringify(__LINE__)": ", DUMP_PREFIX_ADDRESS, 16, 4, key, keylen, 1); @@ -570,10 +565,8 @@ static int rfc4543_setkey(struct crypto_aead *aead, int ret; ret = aes_check_keylen(keylen - 4); - if (ret) { - crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (ret) return ret; - } print_hex_dump_debug("key in @" __stringify(__LINE__)": ", DUMP_PREFIX_ADDRESS, 16, 4, key, keylen, 1); @@ -644,7 +637,7 @@ static int skcipher_setkey(struct crypto_skcipher *skcipher, const u8 *key, ctx->sh_desc_enc); if (ret) { dev_err(jrdev, "driver enc context update failed\n"); - goto badkey; + return -EINVAL; } } @@ -653,14 +646,11 @@ static int skcipher_setkey(struct crypto_skcipher *skcipher, const u8 *key, ctx->sh_desc_dec); if (ret) { dev_err(jrdev, "driver dec context update failed\n"); - goto badkey; + return -EINVAL; } } return ret; -badkey: - crypto_skcipher_set_flags(skcipher, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; } static int aes_skcipher_setkey(struct crypto_skcipher *skcipher, @@ -669,11 +659,8 @@ static int aes_skcipher_setkey(struct crypto_skcipher *skcipher, int err; err = aes_check_keylen(keylen); - if (err) { - crypto_skcipher_set_flags(skcipher, - CRYPTO_TFM_RES_BAD_KEY_LEN); + if (err) return err; - } return 
skcipher_setkey(skcipher, key, keylen, 0); } @@ -693,11 +680,8 @@ static int rfc3686_skcipher_setkey(struct crypto_skcipher *skcipher, keylen -= CTR_RFC3686_NONCE_SIZE; err = aes_check_keylen(keylen); - if (err) { - crypto_skcipher_set_flags(skcipher, - CRYPTO_TFM_RES_BAD_KEY_LEN); + if (err) return err; - } return skcipher_setkey(skcipher, key, keylen, ctx1_iv_off); } @@ -716,11 +700,8 @@ static int ctr_skcipher_setkey(struct crypto_skcipher *skcipher, ctx1_iv_off = 16; err = aes_check_keylen(keylen); - if (err) { - crypto_skcipher_set_flags(skcipher, - CRYPTO_TFM_RES_BAD_KEY_LEN); + if (err) return err; - } return skcipher_setkey(skcipher, key, keylen, ctx1_iv_off); } @@ -748,7 +729,7 @@ static int xts_skcipher_setkey(struct crypto_skcipher *skcipher, const u8 *key, if (keylen != 2 * AES_MIN_KEY_SIZE && keylen != 2 * AES_MAX_KEY_SIZE) { dev_err(jrdev, "key size mismatch\n"); - goto badkey; + return -EINVAL; } ctx->cdata.keylen = keylen; @@ -765,7 +746,7 @@ static int xts_skcipher_setkey(struct crypto_skcipher *skcipher, const u8 *key, ctx->sh_desc_enc); if (ret) { dev_err(jrdev, "driver enc context update failed\n"); - goto badkey; + return -EINVAL; } } @@ -774,14 +755,11 @@ static int xts_skcipher_setkey(struct crypto_skcipher *skcipher, const u8 *key, ctx->sh_desc_dec); if (ret) { dev_err(jrdev, "driver dec context update failed\n"); - goto badkey; + return -EINVAL; } } return ret; -badkey: - crypto_skcipher_set_flags(skcipher, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; } /* diff --git a/drivers/crypto/caam/caamalg_qi2.c b/drivers/crypto/caam/caamalg_qi2.c index 3aeacc36ce23..fe2a628e8905 100644 --- a/drivers/crypto/caam/caamalg_qi2.c +++ b/drivers/crypto/caam/caamalg_qi2.c @@ -313,7 +313,6 @@ static int aead_setkey(struct crypto_aead *aead, const u8 *key, memzero_explicit(&keys, sizeof(keys)); return aead_set_sh_desc(aead); badkey: - crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); memzero_explicit(&keys, sizeof(keys)); return -EINVAL; } @@ -326,11 +325,11 @@ static int des3_aead_setkey(struct crypto_aead *aead, const u8 *key, err = crypto_authenc_extractkeys(&keys, key, keylen); if (unlikely(err)) - goto badkey; + goto out; err = -EINVAL; if (keys.enckeylen != DES3_EDE_KEY_SIZE) - goto badkey; + goto out; err = crypto_des3_ede_verify_key(crypto_aead_tfm(aead), keys.enckey) ?: aead_setkey(aead, key, keylen); @@ -338,10 +337,6 @@ static int des3_aead_setkey(struct crypto_aead *aead, const u8 *key, out: memzero_explicit(&keys, sizeof(keys)); return err; - -badkey: - crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); - goto out; } static struct aead_edesc *aead_edesc_alloc(struct aead_request *req, @@ -634,10 +629,8 @@ static int chachapoly_setkey(struct crypto_aead *aead, const u8 *key, unsigned int ivsize = crypto_aead_ivsize(aead); unsigned int saltlen = CHACHAPOLY_IV_SIZE - ivsize; - if (keylen != CHACHA_KEY_SIZE + saltlen) { - crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen != CHACHA_KEY_SIZE + saltlen) return -EINVAL; - } ctx->cdata.key_virt = key; ctx->cdata.keylen = keylen - saltlen; @@ -725,10 +718,8 @@ static int gcm_setkey(struct crypto_aead *aead, int ret; ret = aes_check_keylen(keylen); - if (ret) { - crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (ret) return ret; - } print_hex_dump_debug("key in @" __stringify(__LINE__)": ", DUMP_PREFIX_ADDRESS, 16, 4, key, keylen, 1); @@ -822,10 +813,8 @@ static int rfc4106_setkey(struct crypto_aead *aead, int ret; ret = aes_check_keylen(keylen - 4); - if (ret) { - 
crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (ret) return ret; - } print_hex_dump_debug("key in @" __stringify(__LINE__)": ", DUMP_PREFIX_ADDRESS, 16, 4, key, keylen, 1); @@ -923,10 +912,8 @@ static int rfc4543_setkey(struct crypto_aead *aead, int ret; ret = aes_check_keylen(keylen - 4); - if (ret) { - crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (ret) return ret; - } print_hex_dump_debug("key in @" __stringify(__LINE__)": ", DUMP_PREFIX_ADDRESS, 16, 4, key, keylen, 1); @@ -992,11 +979,8 @@ static int aes_skcipher_setkey(struct crypto_skcipher *skcipher, int err; err = aes_check_keylen(keylen); - if (err) { - crypto_skcipher_set_flags(skcipher, - CRYPTO_TFM_RES_BAD_KEY_LEN); + if (err) return err; - } return skcipher_setkey(skcipher, key, keylen, 0); } @@ -1016,11 +1000,8 @@ static int rfc3686_skcipher_setkey(struct crypto_skcipher *skcipher, keylen -= CTR_RFC3686_NONCE_SIZE; err = aes_check_keylen(keylen); - if (err) { - crypto_skcipher_set_flags(skcipher, - CRYPTO_TFM_RES_BAD_KEY_LEN); + if (err) return err; - } return skcipher_setkey(skcipher, key, keylen, ctx1_iv_off); } @@ -1039,11 +1020,8 @@ static int ctr_skcipher_setkey(struct crypto_skcipher *skcipher, ctx1_iv_off = 16; err = aes_check_keylen(keylen); - if (err) { - crypto_skcipher_set_flags(skcipher, - CRYPTO_TFM_RES_BAD_KEY_LEN); + if (err) return err; - } return skcipher_setkey(skcipher, key, keylen, ctx1_iv_off); } @@ -1051,11 +1029,8 @@ static int ctr_skcipher_setkey(struct crypto_skcipher *skcipher, static int chacha20_skcipher_setkey(struct crypto_skcipher *skcipher, const u8 *key, unsigned int keylen) { - if (keylen != CHACHA_KEY_SIZE) { - crypto_skcipher_set_flags(skcipher, - CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen != CHACHA_KEY_SIZE) return -EINVAL; - } return skcipher_setkey(skcipher, key, keylen, 0); } @@ -1084,7 +1059,6 @@ static int xts_skcipher_setkey(struct crypto_skcipher *skcipher, const u8 *key, if (keylen != 2 * AES_MIN_KEY_SIZE && keylen != 2 * AES_MAX_KEY_SIZE) { dev_err(dev, "key size mismatch\n"); - crypto_skcipher_set_flags(skcipher, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } @@ -3277,7 +3251,6 @@ static int ahash_setkey(struct crypto_ahash *ahash, const u8 *key, return ret; bad_free_key: kfree(hashed_key); - crypto_ahash_set_flags(ahash, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } diff --git a/drivers/crypto/caam/caamhash.c b/drivers/crypto/caam/caamhash.c index 50a8852ad276..8d9143407fc5 100644 --- a/drivers/crypto/caam/caamhash.c +++ b/drivers/crypto/caam/caamhash.c @@ -473,7 +473,6 @@ static int ahash_setkey(struct crypto_ahash *ahash, return ahash_set_sh_desc(ahash); bad_free_key: kfree(hashed_key); - crypto_ahash_set_flags(ahash, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } @@ -483,10 +482,8 @@ static int axcbc_setkey(struct crypto_ahash *ahash, const u8 *key, struct caam_hash_ctx *ctx = crypto_ahash_ctx(ahash); struct device *jrdev = ctx->jrdev; - if (keylen != AES_KEYSIZE_128) { - crypto_ahash_set_flags(ahash, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen != AES_KEYSIZE_128) return -EINVAL; - } memcpy(ctx->key, key, keylen); dma_sync_single_for_device(jrdev, ctx->adata.key_dma, keylen, @@ -506,10 +503,8 @@ static int acmac_setkey(struct crypto_ahash *ahash, const u8 *key, int err; err = aes_check_keylen(keylen); - if (err) { - crypto_ahash_set_flags(ahash, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (err) return err; - } /* key is immediate data for all cmac shared descriptors */ ctx->adata.key_virt = key; diff --git a/drivers/crypto/cavium/cpt/cptvf_algs.c 
b/drivers/crypto/cavium/cpt/cptvf_algs.c index 1ad66677d88e..1be1adffff1d 100644 --- a/drivers/crypto/cavium/cpt/cptvf_algs.c +++ b/drivers/crypto/cavium/cpt/cptvf_algs.c @@ -295,8 +295,6 @@ static int cvm_setkey(struct crypto_skcipher *cipher, const u8 *key, memcpy(ctx->enc_key, key, keylen); return 0; } else { - crypto_skcipher_set_flags(cipher, - CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } } diff --git a/drivers/crypto/cavium/nitrox/nitrox_aead.c b/drivers/crypto/cavium/nitrox/nitrox_aead.c index 6f80cc3b5c84..dce5423a5883 100644 --- a/drivers/crypto/cavium/nitrox/nitrox_aead.c +++ b/drivers/crypto/cavium/nitrox/nitrox_aead.c @@ -40,10 +40,8 @@ static int nitrox_aes_gcm_setkey(struct crypto_aead *aead, const u8 *key, union fc_ctx_flags flags; aes_keylen = flexi_aes_keylen(keylen); - if (aes_keylen < 0) { - crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (aes_keylen < 0) return -EINVAL; - } /* fill crypto context */ fctx = nctx->u.fctx; diff --git a/drivers/crypto/cavium/nitrox/nitrox_skcipher.c b/drivers/crypto/cavium/nitrox/nitrox_skcipher.c index 97af4d50d003..18088b0a2257 100644 --- a/drivers/crypto/cavium/nitrox/nitrox_skcipher.c +++ b/drivers/crypto/cavium/nitrox/nitrox_skcipher.c @@ -200,10 +200,8 @@ static int nitrox_aes_setkey(struct crypto_skcipher *cipher, const u8 *key, int aes_keylen; aes_keylen = flexi_aes_keylen(keylen); - if (aes_keylen < 0) { - crypto_skcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (aes_keylen < 0) return -EINVAL; - } return nitrox_skcipher_setkey(cipher, aes_keylen, key, keylen); } @@ -351,10 +349,8 @@ static int nitrox_aes_xts_setkey(struct crypto_skcipher *cipher, keylen /= 2; aes_keylen = flexi_aes_keylen(keylen); - if (aes_keylen < 0) { - crypto_skcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (aes_keylen < 0) return -EINVAL; - } fctx = nctx->u.fctx; /* copy KEY2 */ @@ -382,10 +378,8 @@ static int nitrox_aes_ctr_rfc3686_setkey(struct crypto_skcipher *cipher, keylen -= CTR_RFC3686_NONCE_SIZE; aes_keylen = flexi_aes_keylen(keylen); - if (aes_keylen < 0) { - crypto_skcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (aes_keylen < 0) return -EINVAL; - } return nitrox_skcipher_setkey(cipher, aes_keylen, key, keylen); } diff --git a/drivers/crypto/ccp/ccp-crypto-aes-cmac.c b/drivers/crypto/ccp/ccp-crypto-aes-cmac.c index 32f19f402073..5eba7ee49e81 100644 --- a/drivers/crypto/ccp/ccp-crypto-aes-cmac.c +++ b/drivers/crypto/ccp/ccp-crypto-aes-cmac.c @@ -276,7 +276,6 @@ static int ccp_aes_cmac_setkey(struct crypto_ahash *tfm, const u8 *key, ctx->u.aes.type = CCP_AES_TYPE_256; break; default: - crypto_ahash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } ctx->u.aes.mode = alg->mode; diff --git a/drivers/crypto/ccp/ccp-crypto-aes-galois.c b/drivers/crypto/ccp/ccp-crypto-aes-galois.c index ff50ee80d223..9e8f07c1afac 100644 --- a/drivers/crypto/ccp/ccp-crypto-aes-galois.c +++ b/drivers/crypto/ccp/ccp-crypto-aes-galois.c @@ -42,7 +42,6 @@ static int ccp_aes_gcm_setkey(struct crypto_aead *tfm, const u8 *key, ctx->u.aes.type = CCP_AES_TYPE_256; break; default: - crypto_aead_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } diff --git a/drivers/crypto/ccp/ccp-crypto-aes.c b/drivers/crypto/ccp/ccp-crypto-aes.c index 33328a153225..51e12fbd1159 100644 --- a/drivers/crypto/ccp/ccp-crypto-aes.c +++ b/drivers/crypto/ccp/ccp-crypto-aes.c @@ -51,7 +51,6 @@ static int ccp_aes_setkey(struct crypto_skcipher *tfm, const u8 *key, ctx->u.aes.type = CCP_AES_TYPE_256; break; default: - 
crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } ctx->u.aes.mode = alg->mode; diff --git a/drivers/crypto/ccp/ccp-crypto-sha.c b/drivers/crypto/ccp/ccp-crypto-sha.c index 453b9797f93f..474e6f1a6a84 100644 --- a/drivers/crypto/ccp/ccp-crypto-sha.c +++ b/drivers/crypto/ccp/ccp-crypto-sha.c @@ -293,10 +293,8 @@ static int ccp_sha_setkey(struct crypto_ahash *tfm, const u8 *key, ret = crypto_shash_digest(sdesc, key, key_len, ctx->u.sha.key); - if (ret) { - crypto_ahash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (ret) return -EINVAL; - } key_len = digest_size; } else { diff --git a/drivers/crypto/ccree/cc_aead.c b/drivers/crypto/ccree/cc_aead.c index b0085db7e211..d014c8e063a7 100644 --- a/drivers/crypto/ccree/cc_aead.c +++ b/drivers/crypto/ccree/cc_aead.c @@ -562,7 +562,7 @@ static int cc_aead_setkey(struct crypto_aead *tfm, const u8 *key, rc = crypto_authenc_extractkeys(&keys, key, keylen); if (rc) - goto badkey; + return rc; enckey = keys.enckey; authkey = keys.authkey; ctx->enc_keylen = keys.enckeylen; @@ -570,10 +570,9 @@ static int cc_aead_setkey(struct crypto_aead *tfm, const u8 *key, if (ctx->cipher_mode == DRV_CIPHER_CTR) { /* the nonce is stored in bytes at end of key */ - rc = -EINVAL; if (ctx->enc_keylen < (AES_MIN_KEY_SIZE + CTR_RFC3686_NONCE_SIZE)) - goto badkey; + return -EINVAL; /* Copy nonce from last 4 bytes in CTR key to * first 4 bytes in CTR IV */ @@ -591,7 +590,7 @@ static int cc_aead_setkey(struct crypto_aead *tfm, const u8 *key, rc = validate_keys_sizes(ctx); if (rc) - goto badkey; + return rc; /* STAT_PHASE_1: Copy key to ctx */ @@ -605,7 +604,7 @@ static int cc_aead_setkey(struct crypto_aead *tfm, const u8 *key, } else if (ctx->auth_mode != DRV_HASH_NULL) { /* HMAC */ rc = cc_get_plain_hmac_key(tfm, authkey, ctx->auth_keylen); if (rc) - goto badkey; + return rc; } /* STAT_PHASE_2: Create sequence */ @@ -622,8 +621,7 @@ static int cc_aead_setkey(struct crypto_aead *tfm, const u8 *key, break; /* No auth. 
key setup */ default: dev_err(dev, "Unsupported authenc (%d)\n", ctx->auth_mode); - rc = -ENOTSUPP; - goto badkey; + return -ENOTSUPP; } /* STAT_PHASE_3: Submit sequence to HW */ @@ -632,18 +630,12 @@ static int cc_aead_setkey(struct crypto_aead *tfm, const u8 *key, rc = cc_send_sync_request(ctx->drvdata, &cc_req, desc, seq_len); if (rc) { dev_err(dev, "send_request() failed (rc=%d)\n", rc); - goto setkey_error; + return rc; } } /* Update STAT_PHASE_3 */ return rc; - -badkey: - crypto_aead_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); - -setkey_error: - return rc; } static int cc_des3_aead_setkey(struct crypto_aead *aead, const u8 *key, diff --git a/drivers/crypto/ccree/cc_cipher.c b/drivers/crypto/ccree/cc_cipher.c index 61b9dcaa0c05..7493a32f12b9 100644 --- a/drivers/crypto/ccree/cc_cipher.c +++ b/drivers/crypto/ccree/cc_cipher.c @@ -291,7 +291,6 @@ static int cc_cipher_sethkey(struct crypto_skcipher *sktfm, const u8 *key, /* This check the size of the protected key token */ if (keylen != sizeof(hki)) { dev_err(dev, "Unsupported protected key size %d.\n", keylen); - crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } @@ -304,7 +303,6 @@ static int cc_cipher_sethkey(struct crypto_skcipher *sktfm, const u8 *key, if (validate_keys_sizes(ctx_p, keylen)) { dev_err(dev, "Unsupported key size %d.\n", keylen); - crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } @@ -395,7 +393,6 @@ static int cc_cipher_setkey(struct crypto_skcipher *sktfm, const u8 *key, if (validate_keys_sizes(ctx_p, keylen)) { dev_err(dev, "Unsupported key size %d.\n", keylen); - crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } diff --git a/drivers/crypto/ccree/cc_hash.c b/drivers/crypto/ccree/cc_hash.c index aee5db5f8538..912e5ce5079d 100644 --- a/drivers/crypto/ccree/cc_hash.c +++ b/drivers/crypto/ccree/cc_hash.c @@ -899,9 +899,6 @@ static int cc_hash_setkey(struct crypto_ahash *ahash, const u8 *key, rc = cc_send_sync_request(ctx->drvdata, &cc_req, desc, idx); out: - if (rc) - crypto_ahash_set_flags(ahash, CRYPTO_TFM_RES_BAD_KEY_LEN); - if (ctx->key_params.key_dma_addr) { dma_unmap_single(dev, ctx->key_params.key_dma_addr, ctx->key_params.keylen, DMA_TO_DEVICE); @@ -990,9 +987,6 @@ static int cc_xcbc_setkey(struct crypto_ahash *ahash, rc = cc_send_sync_request(ctx->drvdata, &cc_req, desc, idx); - if (rc) - crypto_ahash_set_flags(ahash, CRYPTO_TFM_RES_BAD_KEY_LEN); - dma_unmap_single(dev, ctx->key_params.key_dma_addr, ctx->key_params.keylen, DMA_TO_DEVICE); dev_dbg(dev, "Unmapped key-buffer: key_dma_addr=%pad keylen=%u\n", diff --git a/drivers/crypto/chelsio/chcr_algo.c b/drivers/crypto/chelsio/chcr_algo.c index 5b7dbe7cdb17..720b2ff55464 100644 --- a/drivers/crypto/chelsio/chcr_algo.c +++ b/drivers/crypto/chelsio/chcr_algo.c @@ -912,7 +912,6 @@ static int chcr_aes_cbc_setkey(struct crypto_skcipher *cipher, ablkctx->ciph_mode = CHCR_SCMD_CIPHER_MODE_AES_CBC; return 0; badkey_err: - crypto_skcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN); ablkctx->enckey_len = 0; return err; @@ -943,7 +942,6 @@ static int chcr_aes_ctr_setkey(struct crypto_skcipher *cipher, return 0; badkey_err: - crypto_skcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN); ablkctx->enckey_len = 0; return err; @@ -981,7 +979,6 @@ static int chcr_aes_rfc3686_setkey(struct crypto_skcipher *cipher, return 0; badkey_err: - crypto_skcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN); ablkctx->enckey_len = 0; return err; @@ -2174,7 +2171,6 @@ static int chcr_aes_xts_setkey(struct 
crypto_skcipher *cipher, const u8 *key, ablkctx->ciph_mode = CHCR_SCMD_CIPHER_MODE_AES_XTS; return 0; badkey_err: - crypto_skcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN); ablkctx->enckey_len = 0; return err; @@ -3284,7 +3280,6 @@ static int chcr_ccm_common_setkey(struct crypto_aead *aead, ck_size = CHCR_KEYCTX_CIPHER_KEY_SIZE_256; mk_size = CHCR_KEYCTX_MAC_KEY_SIZE_256; } else { - crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); aeadctx->enckey_len = 0; return -EINVAL; } @@ -3322,7 +3317,6 @@ static int chcr_aead_rfc4309_setkey(struct crypto_aead *aead, const u8 *key, int error; if (keylen < 3) { - crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); aeadctx->enckey_len = 0; return -EINVAL; } @@ -3372,7 +3366,6 @@ static int chcr_gcm_setkey(struct crypto_aead *aead, const u8 *key, } else if (keylen == AES_KEYSIZE_256) { ck_size = CHCR_KEYCTX_CIPHER_KEY_SIZE_256; } else { - crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); pr_err("GCM: Invalid key length %d\n", keylen); ret = -EINVAL; goto out; @@ -3429,10 +3422,8 @@ static int chcr_authenc_setkey(struct crypto_aead *authenc, const u8 *key, if (err) goto out; - if (crypto_authenc_extractkeys(&keys, key, keylen) != 0) { - crypto_aead_set_flags(authenc, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (crypto_authenc_extractkeys(&keys, key, keylen) != 0) goto out; - } if (get_alg_config(&param, max_authsize)) { pr_err("chcr : Unsupported digest size\n"); @@ -3559,10 +3550,9 @@ static int chcr_aead_digest_null_setkey(struct crypto_aead *authenc, if (err) goto out; - if (crypto_authenc_extractkeys(&keys, key, keylen) != 0) { - crypto_aead_set_flags(authenc, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (crypto_authenc_extractkeys(&keys, key, keylen) != 0) goto out; - } + subtype = get_aead_subtype(authenc); if (subtype == CRYPTO_ALG_SUB_TYPE_CTR_SHA || subtype == CRYPTO_ALG_SUB_TYPE_CTR_NULL) { diff --git a/drivers/crypto/geode-aes.c b/drivers/crypto/geode-aes.c index 73a899e6f837..eb6e6b618361 100644 --- a/drivers/crypto/geode-aes.c +++ b/drivers/crypto/geode-aes.c @@ -119,11 +119,9 @@ static int geode_setkey_cip(struct crypto_tfm *tfm, const u8 *key, return 0; } - if (len != AES_KEYSIZE_192 && len != AES_KEYSIZE_256) { + if (len != AES_KEYSIZE_192 && len != AES_KEYSIZE_256) /* not supported at all */ - tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; return -EINVAL; - } /* * The requested key size is not supported by HW, do a fallback @@ -154,11 +152,9 @@ static int geode_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, return 0; } - if (len != AES_KEYSIZE_192 && len != AES_KEYSIZE_256) { + if (len != AES_KEYSIZE_192 && len != AES_KEYSIZE_256) /* not supported at all */ - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; - } /* * The requested key size is not supported by HW, do a fallback diff --git a/drivers/crypto/inside-secure/safexcel_cipher.c b/drivers/crypto/inside-secure/safexcel_cipher.c index f4ece0d8bd6c..5ee66532f336 100644 --- a/drivers/crypto/inside-secure/safexcel_cipher.c +++ b/drivers/crypto/inside-secure/safexcel_cipher.c @@ -380,10 +380,8 @@ static int safexcel_skcipher_aes_setkey(struct crypto_skcipher *ctfm, int ret, i; ret = aes_expandkey(&aes, key, len); - if (ret) { - crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (ret) return ret; - } if (priv->flags & EIP197_TRC_CACHE && ctx->base.ctxr_dma) { for (i = 0; i < len / sizeof(u32); i++) { @@ -433,12 +431,12 @@ static int safexcel_aead_setkey(struct crypto_aead *ctfm, const u8 *key, case SAFEXCEL_DES: err =
verify_aead_des_key(ctfm, keys.enckey, keys.enckeylen); if (unlikely(err)) - goto badkey_expflags; + goto badkey; break; case SAFEXCEL_3DES: err = verify_aead_des3_key(ctfm, keys.enckey, keys.enckeylen); if (unlikely(err)) - goto badkey_expflags; + goto badkey; break; case SAFEXCEL_AES: err = aes_expandkey(&aes, keys.enckey, keys.enckeylen); @@ -521,8 +519,6 @@ static int safexcel_aead_setkey(struct crypto_aead *ctfm, const u8 *key, return 0; badkey: - crypto_aead_set_flags(ctfm, CRYPTO_TFM_RES_BAD_KEY_LEN); -badkey_expflags: memzero_explicit(&keys, sizeof(keys)); return err; } @@ -1444,10 +1440,8 @@ static int safexcel_skcipher_aesctr_setkey(struct crypto_skcipher *ctfm, /* exclude the nonce here */ keylen = len - CTR_RFC3686_NONCE_SIZE; ret = aes_expandkey(&aes, key, keylen); - if (ret) { - crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (ret) return ret; - } if (priv->flags & EIP197_TRC_CACHE && ctx->base.ctxr_dma) { for (i = 0; i < keylen / sizeof(u32); i++) { @@ -2459,10 +2453,8 @@ static int safexcel_skcipher_aesxts_setkey(struct crypto_skcipher *ctfm, /* Only half of the key data is cipher key */ keylen = (len >> 1); ret = aes_expandkey(&aes, key, keylen); - if (ret) { - crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (ret) return ret; - } if (priv->flags & EIP197_TRC_CACHE && ctx->base.ctxr_dma) { for (i = 0; i < keylen / sizeof(u32); i++) { @@ -2478,10 +2470,8 @@ static int safexcel_skcipher_aesxts_setkey(struct crypto_skcipher *ctfm, /* The other half is the tweak key */ ret = aes_expandkey(&aes, (u8 *)(key + keylen), keylen); - if (ret) { - crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (ret) return ret; - } if (priv->flags & EIP197_TRC_CACHE && ctx->base.ctxr_dma) { for (i = 0; i < keylen / sizeof(u32); i++) { @@ -2570,7 +2560,6 @@ static int safexcel_aead_gcm_setkey(struct crypto_aead *ctfm, const u8 *key, ret = aes_expandkey(&aes, key, len); if (ret) { - crypto_aead_set_flags(ctfm, CRYPTO_TFM_RES_BAD_KEY_LEN); memzero_explicit(&aes, sizeof(aes)); return ret; } @@ -2684,7 +2673,6 @@ static int safexcel_aead_ccm_setkey(struct crypto_aead *ctfm, const u8 *key, ret = aes_expandkey(&aes, key, len); if (ret) { - crypto_aead_set_flags(ctfm, CRYPTO_TFM_RES_BAD_KEY_LEN); memzero_explicit(&aes, sizeof(aes)); return ret; } @@ -2815,10 +2803,9 @@ static int safexcel_skcipher_chacha20_setkey(struct crypto_skcipher *ctfm, { struct safexcel_cipher_ctx *ctx = crypto_skcipher_ctx(ctfm); - if (len != CHACHA_KEY_SIZE) { - crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (len != CHACHA_KEY_SIZE) return -EINVAL; - } + safexcel_chacha20_setkey(ctx, key); return 0; @@ -2872,10 +2859,9 @@ static int safexcel_aead_chachapoly_setkey(struct crypto_aead *ctfm, len -= EIP197_AEAD_IPSEC_NONCE_SIZE; ctx->nonce = *(u32 *)(key + len); } - if (len != CHACHA_KEY_SIZE) { - crypto_aead_set_flags(ctfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (len != CHACHA_KEY_SIZE) return -EINVAL; - } + safexcel_chacha20_setkey(ctx, key); return 0; @@ -3070,10 +3056,8 @@ static int safexcel_skcipher_sm4_setkey(struct crypto_skcipher *ctfm, struct safexcel_cipher_ctx *ctx = crypto_tfm_ctx(tfm); struct safexcel_crypto_priv *priv = ctx->priv; - if (len != SM4_KEY_SIZE) { - crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (len != SM4_KEY_SIZE) return -EINVAL; - } if (priv->flags & EIP197_TRC_CACHE && ctx->base.ctxr_dma) if (memcmp(ctx->key, key, SM4_KEY_SIZE)) diff --git a/drivers/crypto/inside-secure/safexcel_hash.c 
b/drivers/crypto/inside-secure/safexcel_hash.c index 25e49d1c96e8..088d7f8aab5e 100644 --- a/drivers/crypto/inside-secure/safexcel_hash.c +++ b/drivers/crypto/inside-secure/safexcel_hash.c @@ -1919,10 +1919,8 @@ static int safexcel_crc32_setkey(struct crypto_ahash *tfm, const u8 *key, { struct safexcel_ahash_ctx *ctx = crypto_tfm_ctx(crypto_ahash_tfm(tfm)); - if (keylen != sizeof(u32)) { - crypto_ahash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen != sizeof(u32)) return -EINVAL; - } memcpy(ctx->ipad, key, sizeof(u32)); return 0; @@ -1995,10 +1993,8 @@ static int safexcel_cbcmac_setkey(struct crypto_ahash *tfm, const u8 *key, int ret, i; ret = aes_expandkey(&aes, key, len); - if (ret) { - crypto_ahash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (ret) return ret; - } memset(ctx->ipad, 0, 2 * AES_BLOCK_SIZE); for (i = 0; i < len / sizeof(u32); i++) @@ -2065,10 +2061,8 @@ static int safexcel_xcbcmac_setkey(struct crypto_ahash *tfm, const u8 *key, int ret, i; ret = aes_expandkey(&aes, key, len); - if (ret) { - crypto_ahash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (ret) return ret; - } /* precompute the XCBC key material */ crypto_cipher_clear_flags(ctx->kaes, CRYPTO_TFM_REQ_MASK); @@ -2168,10 +2162,8 @@ static int safexcel_cmac_setkey(struct crypto_ahash *tfm, const u8 *key, int ret, i; ret = aes_expandkey(&aes, key, len); - if (ret) { - crypto_ahash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (ret) return ret; - } for (i = 0; i < len / sizeof(u32); i++) ctx->ipad[i + 8] = diff --git a/drivers/crypto/ixp4xx_crypto.c b/drivers/crypto/ixp4xx_crypto.c index 391e3b4df364..f64bde506ae8 100644 --- a/drivers/crypto/ixp4xx_crypto.c +++ b/drivers/crypto/ixp4xx_crypto.c @@ -740,7 +740,6 @@ static int setup_cipher(struct crypto_tfm *tfm, int encrypt, u32 keylen_cfg = 0; struct ix_sa_dir *dir; struct ixp_ctx *ctx = crypto_tfm_ctx(tfm); - u32 *flags = &tfm->crt_flags; dir = encrypt ? 
&ctx->encrypt : &ctx->decrypt; cinfo = dir->npe_ctx; @@ -757,7 +756,6 @@ static int setup_cipher(struct crypto_tfm *tfm, int encrypt, case 24: keylen_cfg = MOD_AES192; break; case 32: keylen_cfg = MOD_AES256; break; default: - *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; return -EINVAL; } cipher_cfg |= keylen_cfg; @@ -1169,7 +1167,6 @@ static int aead_setkey(struct crypto_aead *tfm, const u8 *key, memzero_explicit(&keys, sizeof(keys)); return aead_setup(tfm, crypto_aead_authsize(tfm)); badkey: - crypto_aead_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); memzero_explicit(&keys, sizeof(keys)); return -EINVAL; } diff --git a/drivers/crypto/marvell/cipher.c b/drivers/crypto/marvell/cipher.c index d8e8c857770c..c24f34a48cef 100644 --- a/drivers/crypto/marvell/cipher.c +++ b/drivers/crypto/marvell/cipher.c @@ -255,10 +255,8 @@ static int mv_cesa_aes_setkey(struct crypto_skcipher *cipher, const u8 *key, int i; ret = aes_expandkey(&ctx->aes, key, len); - if (ret) { - crypto_skcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (ret) return ret; - } remaining = (ctx->aes.key_length - 16) / 4; offset = ctx->aes.key_length + 24 - remaining; diff --git a/drivers/crypto/mediatek/mtk-aes.c b/drivers/crypto/mediatek/mtk-aes.c index 90880a81c534..00e580bf8536 100644 --- a/drivers/crypto/mediatek/mtk-aes.c +++ b/drivers/crypto/mediatek/mtk-aes.c @@ -652,7 +652,6 @@ static int mtk_aes_setkey(struct crypto_skcipher *tfm, break; default: - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } @@ -1022,7 +1021,6 @@ static int mtk_aes_gcm_setkey(struct crypto_aead *aead, const u8 *key, break; default: - crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } diff --git a/drivers/crypto/n2_core.c b/drivers/crypto/n2_core.c index 63bd565048f4..f5c468f2cc82 100644 --- a/drivers/crypto/n2_core.c +++ b/drivers/crypto/n2_core.c @@ -746,7 +746,6 @@ static int n2_aes_setkey(struct crypto_skcipher *skcipher, const u8 *key, ctx->enc_type |= ENC_TYPE_ALG_AES256; break; default: - crypto_skcipher_set_flags(skcipher, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } diff --git a/drivers/crypto/padlock-aes.c b/drivers/crypto/padlock-aes.c index c5b60f50e1b5..594d6b1695d5 100644 --- a/drivers/crypto/padlock-aes.c +++ b/drivers/crypto/padlock-aes.c @@ -108,14 +108,11 @@ static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, { struct aes_ctx *ctx = aes_ctx(tfm); const __le32 *key = (const __le32 *)in_key; - u32 *flags = &tfm->crt_flags; struct crypto_aes_ctx gen_aes; int cpu; - if (key_len % 8) { - *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + if (key_len % 8) return -EINVAL; - } /* * If the hardware is capable of generating the extended key @@ -146,10 +143,8 @@ static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, ctx->cword.encrypt.keygen = 1; ctx->cword.decrypt.keygen = 1; - if (aes_expandkey(&gen_aes, in_key, key_len)) { - *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + if (aes_expandkey(&gen_aes, in_key, key_len)) return -EINVAL; - } memcpy(ctx->E, gen_aes.key_enc, AES_MAX_KEYLENGTH); memcpy(ctx->D, gen_aes.key_dec, AES_MAX_KEYLENGTH); diff --git a/drivers/crypto/picoxcell_crypto.c b/drivers/crypto/picoxcell_crypto.c index d187312b9864..ced4cbed9ea0 100644 --- a/drivers/crypto/picoxcell_crypto.c +++ b/drivers/crypto/picoxcell_crypto.c @@ -490,7 +490,6 @@ static int spacc_aead_setkey(struct crypto_aead *tfm, const u8 *key, return 0; badkey: - crypto_aead_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); memzero_explicit(&keys, sizeof(keys)); return -EINVAL; } @@ -780,10 +779,8 @@ static 
int spacc_aes_setkey(struct crypto_skcipher *cipher, const u8 *key, struct spacc_ablk_ctx *ctx = crypto_tfm_ctx(tfm); int err = 0; - if (len > AES_MAX_KEY_SIZE) { - crypto_skcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (len > AES_MAX_KEY_SIZE) return -EINVAL; - } /* * IPSec engine only supports 128 and 256 bit AES keys. If we get a @@ -830,7 +827,6 @@ static int spacc_kasumi_f8_setkey(struct crypto_skcipher *cipher, int err = 0; if (len > AES_MAX_KEY_SIZE) { - crypto_skcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN); err = -EINVAL; goto out; } diff --git a/drivers/crypto/qat/qat_common/qat_algs.c b/drivers/crypto/qat/qat_common/qat_algs.c index 35bca76b640f..833bb1d3a11b 100644 --- a/drivers/crypto/qat/qat_common/qat_algs.c +++ b/drivers/crypto/qat/qat_common/qat_algs.c @@ -570,7 +570,6 @@ static int qat_alg_aead_init_sessions(struct crypto_aead *tfm, const u8 *key, memzero_explicit(&keys, sizeof(keys)); return 0; bad_key: - crypto_aead_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); memzero_explicit(&keys, sizeof(keys)); return -EINVAL; error: @@ -586,14 +585,11 @@ static int qat_alg_skcipher_init_sessions(struct qat_alg_skcipher_ctx *ctx, int alg; if (qat_alg_validate_key(keylen, &alg, mode)) - goto bad_key; + return -EINVAL; qat_alg_skcipher_init_enc(ctx, alg, key, keylen, mode); qat_alg_skcipher_init_dec(ctx, alg, key, keylen, mode); return 0; -bad_key: - crypto_skcipher_set_flags(ctx->tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; } static int qat_alg_aead_rekey(struct crypto_aead *tfm, const uint8_t *key, diff --git a/drivers/crypto/qce/sha.c b/drivers/crypto/qce/sha.c index 95ab16fc8fd6..1ab62e7d5f3c 100644 --- a/drivers/crypto/qce/sha.c +++ b/drivers/crypto/qce/sha.c @@ -396,8 +396,6 @@ static int qce_ahash_hmac_setkey(struct crypto_ahash *tfm, const u8 *key, ahash_request_set_crypt(req, &sg, ctx->authkey, keylen); ret = crypto_wait_req(crypto_ahash_digest(req), &wait); - if (ret) - crypto_ahash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); kfree(buf); err_free_req: diff --git a/drivers/crypto/rockchip/rk3288_crypto_skcipher.c b/drivers/crypto/rockchip/rk3288_crypto_skcipher.c index ca4de4ddfe1f..4a75c8e1fa6c 100644 --- a/drivers/crypto/rockchip/rk3288_crypto_skcipher.c +++ b/drivers/crypto/rockchip/rk3288_crypto_skcipher.c @@ -34,10 +34,8 @@ static int rk_aes_setkey(struct crypto_skcipher *cipher, struct rk_cipher_ctx *ctx = crypto_tfm_ctx(tfm); if (keylen != AES_KEYSIZE_128 && keylen != AES_KEYSIZE_192 && - keylen != AES_KEYSIZE_256) { - crypto_skcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN); + keylen != AES_KEYSIZE_256) return -EINVAL; - } ctx->keylen = keylen; memcpy_toio(ctx->dev->reg + RK_CRYPTO_AES_KEY_0, key, keylen); return 0; diff --git a/drivers/crypto/stm32/stm32-crc32.c b/drivers/crypto/stm32/stm32-crc32.c index 9e11c3480353..8e92e4ac79f1 100644 --- a/drivers/crypto/stm32/stm32-crc32.c +++ b/drivers/crypto/stm32/stm32-crc32.c @@ -85,10 +85,8 @@ static int stm32_crc_setkey(struct crypto_shash *tfm, const u8 *key, { struct stm32_crc_ctx *mctx = crypto_shash_ctx(tfm); - if (keylen != sizeof(u32)) { - crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen != sizeof(u32)) return -EINVAL; - } mctx->key = get_unaligned_le32(key); return 0; diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c index d71d65846e47..9c6db7f698c4 100644 --- a/drivers/crypto/talitos.c +++ b/drivers/crypto/talitos.c @@ -914,7 +914,6 @@ static int aead_setkey(struct crypto_aead *authenc, return 0; badkey: - crypto_aead_set_flags(authenc, 
CRYPTO_TFM_RES_BAD_KEY_LEN); memzero_explicit(&keys, sizeof(keys)); return -EINVAL; } @@ -929,11 +928,11 @@ static int aead_des3_setkey(struct crypto_aead *authenc, err = crypto_authenc_extractkeys(&keys, key, keylen); if (unlikely(err)) - goto badkey; + goto out; err = -EINVAL; if (keys.authkeylen + keys.enckeylen > TALITOS_MAX_KEY_SIZE) - goto badkey; + goto out; err = verify_aead_des3_key(authenc, keys.enckey, keys.enckeylen); if (err) @@ -954,10 +953,6 @@ static int aead_des3_setkey(struct crypto_aead *authenc, out: memzero_explicit(&keys, sizeof(keys)); return err; - -badkey: - crypto_aead_set_flags(authenc, CRYPTO_TFM_RES_BAD_KEY_LEN); - goto out; } static void talitos_sg_unmap(struct device *dev, @@ -1528,8 +1523,6 @@ static int skcipher_aes_setkey(struct crypto_skcipher *cipher, keylen == AES_KEYSIZE_256) return skcipher_setkey(cipher, key, keylen); - crypto_skcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; } @@ -2234,10 +2227,8 @@ static int ahash_setkey(struct crypto_ahash *tfm, const u8 *key, /* Must get the hash of the long key */ ret = keyhash(tfm, key, keylen, hash); - if (ret) { - crypto_ahash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (ret) return -EINVAL; - } keysize = digestsize; memcpy(ctx->key, hash, digestsize); diff --git a/drivers/crypto/ux500/cryp/cryp_core.c b/drivers/crypto/ux500/cryp/cryp_core.c index 95fb694a2667..800dfc4d16c4 100644 --- a/drivers/crypto/ux500/cryp/cryp_core.c +++ b/drivers/crypto/ux500/cryp/cryp_core.c @@ -951,7 +951,6 @@ static int aes_skcipher_setkey(struct crypto_skcipher *cipher, const u8 *key, unsigned int keylen) { struct cryp_ctx *ctx = crypto_skcipher_ctx(cipher); - u32 *flags = &cipher->base.crt_flags; pr_debug(DEV_DBG_NAME " [%s]", __func__); @@ -970,7 +969,6 @@ static int aes_skcipher_setkey(struct crypto_skcipher *cipher, default: pr_err(DEV_DBG_NAME "[%s]: Unknown keylen!", __func__); - *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; return -EINVAL; } diff --git a/drivers/crypto/virtio/virtio_crypto_algs.c b/drivers/crypto/virtio/virtio_crypto_algs.c index 4b71e80951b7..fd045e64972a 100644 --- a/drivers/crypto/virtio/virtio_crypto_algs.c +++ b/drivers/crypto/virtio/virtio_crypto_algs.c @@ -272,11 +272,11 @@ static int virtio_crypto_alg_skcipher_init_sessions( if (keylen > vcrypto->max_cipher_key_len) { pr_err("virtio_crypto: the key is too long\n"); - goto bad_key; + return -EINVAL; } if (virtio_crypto_alg_validate_key(keylen, &alg)) - goto bad_key; + return -EINVAL; /* Create encryption session */ ret = virtio_crypto_alg_skcipher_init_session(ctx, @@ -291,10 +291,6 @@ static int virtio_crypto_alg_skcipher_init_sessions( return ret; } return 0; - -bad_key: - crypto_skcipher_set_flags(ctx->tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; } /* Note: kernel crypto API realization */ diff --git a/include/crypto/cast6.h b/include/crypto/cast6.h index 4c8d0c72f78d..38f490cd50a8 100644 --- a/include/crypto/cast6.h +++ b/include/crypto/cast6.h @@ -15,8 +15,7 @@ struct cast6_ctx { u8 Kr[12][4]; }; -int __cast6_setkey(struct cast6_ctx *ctx, const u8 *key, - unsigned int keylen, u32 *flags); +int __cast6_setkey(struct cast6_ctx *ctx, const u8 *key, unsigned int keylen); int cast6_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen); void __cast6_encrypt(const void *ctx, u8 *dst, const u8 *src); diff --git a/include/crypto/internal/des.h b/include/crypto/internal/des.h index f62a2bb1866b..355ddaae3806 100644 --- a/include/crypto/internal/des.h +++ b/include/crypto/internal/des.h @@ -120,20 +120,16 @@ static 
inline int verify_skcipher_des3_key(struct crypto_skcipher *tfm, static inline int verify_aead_des_key(struct crypto_aead *tfm, const u8 *key, int keylen) { - if (keylen != DES_KEY_SIZE) { - crypto_aead_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen != DES_KEY_SIZE) return -EINVAL; - } return crypto_des_verify_key(crypto_aead_tfm(tfm), key); } static inline int verify_aead_des3_key(struct crypto_aead *tfm, const u8 *key, int keylen) { - if (keylen != DES3_EDE_KEY_SIZE) { - crypto_aead_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen != DES3_EDE_KEY_SIZE) return -EINVAL; - } return crypto_des3_ede_verify_key(crypto_aead_tfm(tfm), key); } diff --git a/include/crypto/twofish.h b/include/crypto/twofish.h index 2e2c09673d88..f6b307a58554 100644 --- a/include/crypto/twofish.h +++ b/include/crypto/twofish.h @@ -19,7 +19,7 @@ struct twofish_ctx { }; int __twofish_setkey(struct twofish_ctx *ctx, const u8 *key, - unsigned int key_len, u32 *flags); + unsigned int key_len); int twofish_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int key_len); #endif diff --git a/include/crypto/xts.h b/include/crypto/xts.h index 15ae7fdc0478..57b2c52928db 100644 --- a/include/crypto/xts.h +++ b/include/crypto/xts.h @@ -17,10 +17,8 @@ static inline int xts_check_key(struct crypto_tfm *tfm, * key consists of keys of equal size concatenated, therefore * the length must be even. */ - if (keylen % 2) { - *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + if (keylen % 2) return -EINVAL; - } /* ensure that the AES and tweak key are not identical */ if (fips_enabled && @@ -39,10 +37,8 @@ static inline int xts_verify_key(struct crypto_skcipher *tfm, * key consists of keys of equal size concatenated, therefore * the length must be even. */ - if (keylen % 2) { - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen % 2) return -EINVAL; - } /* ensure that the AES and tweak key are not identical */ if ((fips_enabled || (crypto_skcipher_get_flags(tfm) & diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 719a301af3f2..61fccc7d0efb 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -113,7 +113,6 @@ #define CRYPTO_TFM_REQ_MAY_SLEEP 0x00000200 #define CRYPTO_TFM_REQ_MAY_BACKLOG 0x00000400 #define CRYPTO_TFM_RES_WEAK_KEY 0x00100000 -#define CRYPTO_TFM_RES_BAD_KEY_LEN 0x00200000 /* * Miscellaneous stuff. -- cgit v1.2.3 From af5034e8e4a5838fc77e476c1a91822e449d5869 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Dec 2019 21:19:38 -0600 Subject: crypto: remove propagation of CRYPTO_TFM_RES_* flags The CRYPTO_TFM_RES_* flags were apparently meant as a way to make the ->setkey() functions provide more information about errors. But these flags weren't actually being used or tested, and in many cases they weren't being set correctly anyway. So they've now been removed. Also, if someone ever actually needs to start better distinguishing ->setkey() errors (which is somewhat unlikely, as this has been unneeded for a long time), we'd be much better off just defining different return values, like -EINVAL if the key is invalid for the algorithm vs. -EKEYREJECTED if the key was rejected by a policy like "no weak keys". That would be much simpler, less error-prone, and easier to test. So just remove CRYPTO_TFM_RES_MASK and all the unneeded logic that propagates these flags around. 
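For illustration, the conversion applied to each driver follows this shape. This is a minimal before/after sketch of a hypothetical ->setkey() (the function name is made up; the helpers are the ones used throughout this patch):

	#include <crypto/aes.h>		/* aes_check_keylen() */
	#include <crypto/skcipher.h>	/* crypto_skcipher_set_flags() */

	/* Before: the error is reported twice, via a tfm flag and the
	 * return value, even though callers only check the return value.
	 */
	static int example_aes_setkey(struct crypto_skcipher *tfm,
				      const u8 *key, unsigned int keylen)
	{
		int err = aes_check_keylen(keylen);

		if (err) {
			crypto_skcipher_set_flags(tfm,
						  CRYPTO_TFM_RES_BAD_KEY_LEN);
			return err;
		}
		/* ... program the key into the context/hardware ... */
		return 0;
	}

	/* After: the return value alone carries the error. */
	static int example_aes_setkey(struct crypto_skcipher *tfm,
				      const u8 *key, unsigned int keylen)
	{
		int err = aes_check_keylen(keylen);

		if (err)
			return err;
		/* ... program the key into the context/hardware ... */
		return 0;
	}

If callers ever do need to distinguish key errors, distinct return codes (-EINVAL vs. -EKEYREJECTED, as noted above) achieve that with no flag plumbing.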
Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/arm/crypto/ghash-ce-glue.c | 7 +------ arch/s390/crypto/aes_s390.c | 23 +++-------------------- arch/x86/crypto/ghash-clmulni-intel_glue.c | 7 +------ crypto/adiantum.c | 8 -------- crypto/authenc.c | 6 ------ crypto/authencesn.c | 6 ------ crypto/ccm.c | 20 ++++---------------- crypto/chacha20poly1305.c | 7 +------ crypto/cipher.c | 1 - crypto/cryptd.c | 13 ++----------- crypto/ctr.c | 7 +------ crypto/cts.c | 6 +----- crypto/essiv.c | 22 ++++------------------ crypto/gcm.c | 19 ++----------------- crypto/lrw.c | 2 -- crypto/simd.c | 12 ++---------- crypto/skcipher.c | 6 +----- crypto/xts.c | 8 +------- drivers/crypto/amcc/crypto4xx_alg.c | 20 ++------------------ drivers/crypto/atmel-aes.c | 5 +---- drivers/crypto/atmel-authenc.h | 3 +-- drivers/crypto/atmel-sha.c | 11 +++-------- drivers/crypto/bcm/cipher.c | 14 ++------------ drivers/crypto/chelsio/chcr_algo.c | 24 +----------------------- drivers/crypto/geode-aes.c | 16 ++-------------- drivers/crypto/inside-secure/safexcel_cipher.c | 5 ----- drivers/crypto/inside-secure/safexcel_hash.c | 6 ------ drivers/crypto/mediatek/mtk-aes.c | 2 -- drivers/crypto/mxs-dcp.c | 12 +----------- drivers/crypto/picoxcell_crypto.c | 9 --------- drivers/crypto/sahara.c | 9 +-------- include/linux/crypto.h | 2 -- 32 files changed, 38 insertions(+), 280 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/crypto/ghash-ce-glue.c b/arch/arm/crypto/ghash-ce-glue.c index 7e8b2f55685c..a00fd329255f 100644 --- a/arch/arm/crypto/ghash-ce-glue.c +++ b/arch/arm/crypto/ghash-ce-glue.c @@ -294,16 +294,11 @@ static int ghash_async_setkey(struct crypto_ahash *tfm, const u8 *key, { struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); struct crypto_ahash *child = &ctx->cryptd_tfm->base; - int err; crypto_ahash_clear_flags(child, CRYPTO_TFM_REQ_MASK); crypto_ahash_set_flags(child, crypto_ahash_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); - err = crypto_ahash_setkey(child, key, keylen); - crypto_ahash_set_flags(tfm, crypto_ahash_get_flags(child) - & CRYPTO_TFM_RES_MASK); - - return err; + return crypto_ahash_setkey(child, key, keylen); } static int ghash_async_init_tfm(struct crypto_tfm *tfm) diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c index 2db167e5871c..1c23d84a9097 100644 --- a/arch/s390/crypto/aes_s390.c +++ b/arch/s390/crypto/aes_s390.c @@ -72,19 +72,12 @@ static int setkey_fallback_cip(struct crypto_tfm *tfm, const u8 *in_key, unsigned int key_len) { struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); - int ret; sctx->fallback.cip->base.crt_flags &= ~CRYPTO_TFM_REQ_MASK; sctx->fallback.cip->base.crt_flags |= (tfm->crt_flags & CRYPTO_TFM_REQ_MASK); - ret = crypto_cipher_setkey(sctx->fallback.cip, in_key, key_len); - if (ret) { - tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK; - tfm->crt_flags |= (sctx->fallback.cip->base.crt_flags & - CRYPTO_TFM_RES_MASK); - } - return ret; + return crypto_cipher_setkey(sctx->fallback.cip, in_key, key_len); } static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, @@ -182,18 +175,13 @@ static int setkey_fallback_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int len) { struct s390_aes_ctx *sctx = crypto_skcipher_ctx(tfm); - int ret; crypto_skcipher_clear_flags(sctx->fallback.skcipher, CRYPTO_TFM_REQ_MASK); crypto_skcipher_set_flags(sctx->fallback.skcipher, crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); - ret = crypto_skcipher_setkey(sctx->fallback.skcipher, key, len); - crypto_skcipher_set_flags(tfm, - 
crypto_skcipher_get_flags(sctx->fallback.skcipher) & - CRYPTO_TFM_RES_MASK); - return ret; + return crypto_skcipher_setkey(sctx->fallback.skcipher, key, len); } static int fallback_skcipher_crypt(struct s390_aes_ctx *sctx, @@ -389,17 +377,12 @@ static int xts_fallback_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int len) { struct s390_xts_ctx *xts_ctx = crypto_skcipher_ctx(tfm); - int ret; crypto_skcipher_clear_flags(xts_ctx->fallback, CRYPTO_TFM_REQ_MASK); crypto_skcipher_set_flags(xts_ctx->fallback, crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); - ret = crypto_skcipher_setkey(xts_ctx->fallback, key, len); - crypto_skcipher_set_flags(tfm, - crypto_skcipher_get_flags(xts_ctx->fallback) & - CRYPTO_TFM_RES_MASK); - return ret; + return crypto_skcipher_setkey(xts_ctx->fallback, key, len); } static int xts_aes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index 4a9c9833a7d6..a4b728518e28 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -255,16 +255,11 @@ static int ghash_async_setkey(struct crypto_ahash *tfm, const u8 *key, { struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); struct crypto_ahash *child = &ctx->cryptd_tfm->base; - int err; crypto_ahash_clear_flags(child, CRYPTO_TFM_REQ_MASK); crypto_ahash_set_flags(child, crypto_ahash_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); - err = crypto_ahash_setkey(child, key, keylen); - crypto_ahash_set_flags(tfm, crypto_ahash_get_flags(child) - & CRYPTO_TFM_RES_MASK); - - return err; + return crypto_ahash_setkey(child, key, keylen); } static int ghash_async_init_tfm(struct crypto_tfm *tfm) diff --git a/crypto/adiantum.c b/crypto/adiantum.c index aded26092268..30cffb45b88f 100644 --- a/crypto/adiantum.c +++ b/crypto/adiantum.c @@ -135,9 +135,6 @@ static int adiantum_setkey(struct crypto_skcipher *tfm, const u8 *key, crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); err = crypto_skcipher_setkey(tctx->streamcipher, key, keylen); - crypto_skcipher_set_flags(tfm, - crypto_skcipher_get_flags(tctx->streamcipher) & - CRYPTO_TFM_RES_MASK); if (err) return err; @@ -167,9 +164,6 @@ static int adiantum_setkey(struct crypto_skcipher *tfm, const u8 *key, CRYPTO_TFM_REQ_MASK); err = crypto_cipher_setkey(tctx->blockcipher, keyp, BLOCKCIPHER_KEY_SIZE); - crypto_skcipher_set_flags(tfm, - crypto_cipher_get_flags(tctx->blockcipher) & - CRYPTO_TFM_RES_MASK); if (err) goto out; keyp += BLOCKCIPHER_KEY_SIZE; @@ -182,8 +176,6 @@ static int adiantum_setkey(struct crypto_skcipher *tfm, const u8 *key, crypto_shash_set_flags(tctx->hash, crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); err = crypto_shash_setkey(tctx->hash, keyp, NHPOLY1305_KEY_SIZE); - crypto_skcipher_set_flags(tfm, crypto_shash_get_flags(tctx->hash) & - CRYPTO_TFM_RES_MASK); keyp += NHPOLY1305_KEY_SIZE; WARN_ON(keyp != &data->derived_keys[ARRAY_SIZE(data->derived_keys)]); out: diff --git a/crypto/authenc.c b/crypto/authenc.c index 0da80632e872..15aaddd34171 100644 --- a/crypto/authenc.c +++ b/crypto/authenc.c @@ -97,9 +97,6 @@ static int crypto_authenc_setkey(struct crypto_aead *authenc, const u8 *key, crypto_ahash_set_flags(auth, crypto_aead_get_flags(authenc) & CRYPTO_TFM_REQ_MASK); err = crypto_ahash_setkey(auth, keys.authkey, keys.authkeylen); - crypto_aead_set_flags(authenc, crypto_ahash_get_flags(auth) & - CRYPTO_TFM_RES_MASK); - if (err) goto out; @@ -107,9 +104,6 @@ static int crypto_authenc_setkey(struct 
crypto_aead *authenc, const u8 *key, crypto_skcipher_set_flags(enc, crypto_aead_get_flags(authenc) & CRYPTO_TFM_REQ_MASK); err = crypto_skcipher_setkey(enc, keys.enckey, keys.enckeylen); - crypto_aead_set_flags(authenc, crypto_skcipher_get_flags(enc) & - CRYPTO_TFM_RES_MASK); - out: memzero_explicit(&keys, sizeof(keys)); return err; diff --git a/crypto/authencesn.c b/crypto/authencesn.c index 749527e1b617..fc81324ce881 100644 --- a/crypto/authencesn.c +++ b/crypto/authencesn.c @@ -71,9 +71,6 @@ static int crypto_authenc_esn_setkey(struct crypto_aead *authenc_esn, const u8 * crypto_ahash_set_flags(auth, crypto_aead_get_flags(authenc_esn) & CRYPTO_TFM_REQ_MASK); err = crypto_ahash_setkey(auth, keys.authkey, keys.authkeylen); - crypto_aead_set_flags(authenc_esn, crypto_ahash_get_flags(auth) & - CRYPTO_TFM_RES_MASK); - if (err) goto out; @@ -81,9 +78,6 @@ static int crypto_authenc_esn_setkey(struct crypto_aead *authenc_esn, const u8 * crypto_skcipher_set_flags(enc, crypto_aead_get_flags(authenc_esn) & CRYPTO_TFM_REQ_MASK); err = crypto_skcipher_setkey(enc, keys.enckey, keys.enckeylen); - crypto_aead_set_flags(authenc_esn, crypto_skcipher_get_flags(enc) & - CRYPTO_TFM_RES_MASK); - out: memzero_explicit(&keys, sizeof(keys)); return err; diff --git a/crypto/ccm.c b/crypto/ccm.c index 380eb619f657..44104524e95a 100644 --- a/crypto/ccm.c +++ b/crypto/ccm.c @@ -91,26 +91,19 @@ static int crypto_ccm_setkey(struct crypto_aead *aead, const u8 *key, struct crypto_ccm_ctx *ctx = crypto_aead_ctx(aead); struct crypto_skcipher *ctr = ctx->ctr; struct crypto_ahash *mac = ctx->mac; - int err = 0; + int err; crypto_skcipher_clear_flags(ctr, CRYPTO_TFM_REQ_MASK); crypto_skcipher_set_flags(ctr, crypto_aead_get_flags(aead) & CRYPTO_TFM_REQ_MASK); err = crypto_skcipher_setkey(ctr, key, keylen); - crypto_aead_set_flags(aead, crypto_skcipher_get_flags(ctr) & - CRYPTO_TFM_RES_MASK); if (err) - goto out; + return err; crypto_ahash_clear_flags(mac, CRYPTO_TFM_REQ_MASK); crypto_ahash_set_flags(mac, crypto_aead_get_flags(aead) & CRYPTO_TFM_REQ_MASK); - err = crypto_ahash_setkey(mac, key, keylen); - crypto_aead_set_flags(aead, crypto_ahash_get_flags(mac) & - CRYPTO_TFM_RES_MASK); - -out: - return err; + return crypto_ahash_setkey(mac, key, keylen); } static int crypto_ccm_setauthsize(struct crypto_aead *tfm, @@ -604,7 +597,6 @@ static int crypto_rfc4309_setkey(struct crypto_aead *parent, const u8 *key, { struct crypto_rfc4309_ctx *ctx = crypto_aead_ctx(parent); struct crypto_aead *child = ctx->child; - int err; if (keylen < 3) return -EINVAL; @@ -615,11 +607,7 @@ static int crypto_rfc4309_setkey(struct crypto_aead *parent, const u8 *key, crypto_aead_clear_flags(child, CRYPTO_TFM_REQ_MASK); crypto_aead_set_flags(child, crypto_aead_get_flags(parent) & CRYPTO_TFM_REQ_MASK); - err = crypto_aead_setkey(child, key, keylen); - crypto_aead_set_flags(parent, crypto_aead_get_flags(child) & - CRYPTO_TFM_RES_MASK); - - return err; + return crypto_aead_setkey(child, key, keylen); } static int crypto_rfc4309_setauthsize(struct crypto_aead *parent, diff --git a/crypto/chacha20poly1305.c b/crypto/chacha20poly1305.c index 74e824e537e6..88cbdaba43b8 100644 --- a/crypto/chacha20poly1305.c +++ b/crypto/chacha20poly1305.c @@ -477,7 +477,6 @@ static int chachapoly_setkey(struct crypto_aead *aead, const u8 *key, unsigned int keylen) { struct chachapoly_ctx *ctx = crypto_aead_ctx(aead); - int err; if (keylen != ctx->saltlen + CHACHA_KEY_SIZE) return -EINVAL; @@ -488,11 +487,7 @@ static int chachapoly_setkey(struct crypto_aead *aead, const u8 
*key, crypto_skcipher_clear_flags(ctx->chacha, CRYPTO_TFM_REQ_MASK); crypto_skcipher_set_flags(ctx->chacha, crypto_aead_get_flags(aead) & CRYPTO_TFM_REQ_MASK); - - err = crypto_skcipher_setkey(ctx->chacha, key, keylen); - crypto_aead_set_flags(aead, crypto_skcipher_get_flags(ctx->chacha) & - CRYPTO_TFM_RES_MASK); - return err; + return crypto_skcipher_setkey(ctx->chacha, key, keylen); } static int chachapoly_setauthsize(struct crypto_aead *tfm, diff --git a/crypto/cipher.c b/crypto/cipher.c index 0fb7042a709d..fd78150deb1c 100644 --- a/crypto/cipher.c +++ b/crypto/cipher.c @@ -45,7 +45,6 @@ int crypto_cipher_setkey(struct crypto_cipher *tfm, struct cipher_alg *cia = crypto_cipher_alg(tfm); unsigned long alignmask = crypto_cipher_alignmask(tfm); - crypto_cipher_clear_flags(tfm, CRYPTO_TFM_RES_MASK); if (keylen < cia->cia_min_keysize || keylen > cia->cia_max_keysize) return -EINVAL; diff --git a/crypto/cryptd.c b/crypto/cryptd.c index 2c6649b10923..cd94243a1605 100644 --- a/crypto/cryptd.c +++ b/crypto/cryptd.c @@ -252,17 +252,12 @@ static int cryptd_skcipher_setkey(struct crypto_skcipher *parent, { struct cryptd_skcipher_ctx *ctx = crypto_skcipher_ctx(parent); struct crypto_sync_skcipher *child = ctx->child; - int err; crypto_sync_skcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); crypto_sync_skcipher_set_flags(child, crypto_skcipher_get_flags(parent) & CRYPTO_TFM_REQ_MASK); - err = crypto_sync_skcipher_setkey(child, key, keylen); - crypto_skcipher_set_flags(parent, - crypto_sync_skcipher_get_flags(child) & - CRYPTO_TFM_RES_MASK); - return err; + return crypto_sync_skcipher_setkey(child, key, keylen); } static void cryptd_skcipher_complete(struct skcipher_request *req, int err) @@ -491,15 +486,11 @@ static int cryptd_hash_setkey(struct crypto_ahash *parent, { struct cryptd_hash_ctx *ctx = crypto_ahash_ctx(parent); struct crypto_shash *child = ctx->child; - int err; crypto_shash_clear_flags(child, CRYPTO_TFM_REQ_MASK); crypto_shash_set_flags(child, crypto_ahash_get_flags(parent) & CRYPTO_TFM_REQ_MASK); - err = crypto_shash_setkey(child, key, keylen); - crypto_ahash_set_flags(parent, crypto_shash_get_flags(child) & - CRYPTO_TFM_RES_MASK); - return err; + return crypto_shash_setkey(child, key, keylen); } static int cryptd_hash_enqueue(struct ahash_request *req, diff --git a/crypto/ctr.c b/crypto/ctr.c index 1e9d6b86b3c6..b63b19de68a9 100644 --- a/crypto/ctr.c +++ b/crypto/ctr.c @@ -170,7 +170,6 @@ static int crypto_rfc3686_setkey(struct crypto_skcipher *parent, { struct crypto_rfc3686_ctx *ctx = crypto_skcipher_ctx(parent); struct crypto_skcipher *child = ctx->child; - int err; /* the nonce is stored in bytes at end of key */ if (keylen < CTR_RFC3686_NONCE_SIZE) @@ -184,11 +183,7 @@ static int crypto_rfc3686_setkey(struct crypto_skcipher *parent, crypto_skcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); crypto_skcipher_set_flags(child, crypto_skcipher_get_flags(parent) & CRYPTO_TFM_REQ_MASK); - err = crypto_skcipher_setkey(child, key, keylen); - crypto_skcipher_set_flags(parent, crypto_skcipher_get_flags(child) & - CRYPTO_TFM_RES_MASK); - - return err; + return crypto_skcipher_setkey(child, key, keylen); } static int crypto_rfc3686_crypt(struct skcipher_request *req) diff --git a/crypto/cts.c b/crypto/cts.c index 6b6087dbb62a..a0bb994f8b11 100644 --- a/crypto/cts.c +++ b/crypto/cts.c @@ -78,15 +78,11 @@ static int crypto_cts_setkey(struct crypto_skcipher *parent, const u8 *key, { struct crypto_cts_ctx *ctx = crypto_skcipher_ctx(parent); struct crypto_skcipher *child = ctx->child; - int err; 
crypto_skcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); crypto_skcipher_set_flags(child, crypto_skcipher_get_flags(parent) & CRYPTO_TFM_REQ_MASK); - err = crypto_skcipher_setkey(child, key, keylen); - crypto_skcipher_set_flags(parent, crypto_skcipher_get_flags(child) & - CRYPTO_TFM_RES_MASK); - return err; + return crypto_skcipher_setkey(child, key, keylen); } static void cts_cbc_crypt_done(struct crypto_async_request *areq, int err) diff --git a/crypto/essiv.c b/crypto/essiv.c index f49bd6fc6972..61d9000ae4ad 100644 --- a/crypto/essiv.c +++ b/crypto/essiv.c @@ -75,9 +75,6 @@ static int essiv_skcipher_setkey(struct crypto_skcipher *tfm, crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); err = crypto_skcipher_setkey(tctx->u.skcipher, key, keylen); - crypto_skcipher_set_flags(tfm, - crypto_skcipher_get_flags(tctx->u.skcipher) & - CRYPTO_TFM_RES_MASK); if (err) return err; @@ -90,13 +87,8 @@ static int essiv_skcipher_setkey(struct crypto_skcipher *tfm, crypto_cipher_set_flags(tctx->essiv_cipher, crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); - err = crypto_cipher_setkey(tctx->essiv_cipher, salt, - crypto_shash_digestsize(tctx->hash)); - crypto_skcipher_set_flags(tfm, - crypto_cipher_get_flags(tctx->essiv_cipher) & - CRYPTO_TFM_RES_MASK); - - return err; + return crypto_cipher_setkey(tctx->essiv_cipher, salt, + crypto_shash_digestsize(tctx->hash)); } static int essiv_aead_setkey(struct crypto_aead *tfm, const u8 *key, @@ -112,8 +104,6 @@ static int essiv_aead_setkey(struct crypto_aead *tfm, const u8 *key, crypto_aead_set_flags(tctx->u.aead, crypto_aead_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); err = crypto_aead_setkey(tctx->u.aead, key, keylen); - crypto_aead_set_flags(tfm, crypto_aead_get_flags(tctx->u.aead) & - CRYPTO_TFM_RES_MASK); if (err) return err; @@ -130,12 +120,8 @@ static int essiv_aead_setkey(struct crypto_aead *tfm, const u8 *key, crypto_cipher_clear_flags(tctx->essiv_cipher, CRYPTO_TFM_REQ_MASK); crypto_cipher_set_flags(tctx->essiv_cipher, crypto_aead_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); - err = crypto_cipher_setkey(tctx->essiv_cipher, salt, - crypto_shash_digestsize(tctx->hash)); - crypto_aead_set_flags(tfm, crypto_cipher_get_flags(tctx->essiv_cipher) & - CRYPTO_TFM_RES_MASK); - - return err; + return crypto_cipher_setkey(tctx->essiv_cipher, salt, + crypto_shash_digestsize(tctx->hash)); } static int essiv_aead_setauthsize(struct crypto_aead *tfm, diff --git a/crypto/gcm.c b/crypto/gcm.c index 73884208f075..7041cb1b6fd5 100644 --- a/crypto/gcm.c +++ b/crypto/gcm.c @@ -111,8 +111,6 @@ static int crypto_gcm_setkey(struct crypto_aead *aead, const u8 *key, crypto_skcipher_set_flags(ctr, crypto_aead_get_flags(aead) & CRYPTO_TFM_REQ_MASK); err = crypto_skcipher_setkey(ctr, key, keylen); - crypto_aead_set_flags(aead, crypto_skcipher_get_flags(ctr) & - CRYPTO_TFM_RES_MASK); if (err) return err; @@ -141,9 +139,6 @@ static int crypto_gcm_setkey(struct crypto_aead *aead, const u8 *key, crypto_ahash_set_flags(ghash, crypto_aead_get_flags(aead) & CRYPTO_TFM_REQ_MASK); err = crypto_ahash_setkey(ghash, (u8 *)&data->hash, sizeof(be128)); - crypto_aead_set_flags(aead, crypto_ahash_get_flags(ghash) & - CRYPTO_TFM_RES_MASK); - out: kzfree(data); return err; @@ -727,7 +722,6 @@ static int crypto_rfc4106_setkey(struct crypto_aead *parent, const u8 *key, { struct crypto_rfc4106_ctx *ctx = crypto_aead_ctx(parent); struct crypto_aead *child = ctx->child; - int err; if (keylen < 4) return -EINVAL; @@ -738,11 +732,7 @@ static int crypto_rfc4106_setkey(struct crypto_aead *parent, const u8 *key, 
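For context on the keylen checks in these RFC wrappers: the trailing bytes of the user-supplied key are the salt for the deterministic IV rather than key material (four bytes for rfc4106/rfc4543, three for rfc4309 above), which is why too-short keys are rejected before anything reaches the child. A sketch of the split these setkey functions perform, reconstructed from the kernel's rfc4106 convention (the memcpy line is elided context, not shown in this patch):

	if (keylen < 4)
		return -EINVAL;
	keylen -= 4;
	memcpy(ctx->nonce, key + keylen, 4);	/* stash the 4-byte salt */
	/* ... the remaining keylen bytes then go to the child AEAD ... */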
crypto_aead_clear_flags(child, CRYPTO_TFM_REQ_MASK); crypto_aead_set_flags(child, crypto_aead_get_flags(parent) & CRYPTO_TFM_REQ_MASK); - err = crypto_aead_setkey(child, key, keylen); - crypto_aead_set_flags(parent, crypto_aead_get_flags(child) & - CRYPTO_TFM_RES_MASK); - - return err; + return crypto_aead_setkey(child, key, keylen); } static int crypto_rfc4106_setauthsize(struct crypto_aead *parent, @@ -956,7 +946,6 @@ static int crypto_rfc4543_setkey(struct crypto_aead *parent, const u8 *key, { struct crypto_rfc4543_ctx *ctx = crypto_aead_ctx(parent); struct crypto_aead *child = ctx->child; - int err; if (keylen < 4) return -EINVAL; @@ -967,11 +956,7 @@ static int crypto_rfc4543_setkey(struct crypto_aead *parent, const u8 *key, crypto_aead_clear_flags(child, CRYPTO_TFM_REQ_MASK); crypto_aead_set_flags(child, crypto_aead_get_flags(parent) & CRYPTO_TFM_REQ_MASK); - err = crypto_aead_setkey(child, key, keylen); - crypto_aead_set_flags(parent, crypto_aead_get_flags(child) & - CRYPTO_TFM_RES_MASK); - - return err; + return crypto_aead_setkey(child, key, keylen); } static int crypto_rfc4543_setauthsize(struct crypto_aead *parent, diff --git a/crypto/lrw.c b/crypto/lrw.c index be829f6afc8e..8ebd79276c78 100644 --- a/crypto/lrw.c +++ b/crypto/lrw.c @@ -79,8 +79,6 @@ static int setkey(struct crypto_skcipher *parent, const u8 *key, crypto_skcipher_set_flags(child, crypto_skcipher_get_flags(parent) & CRYPTO_TFM_REQ_MASK); err = crypto_skcipher_setkey(child, key, keylen - bsize); - crypto_skcipher_set_flags(parent, crypto_skcipher_get_flags(child) & - CRYPTO_TFM_RES_MASK); if (err) return err; diff --git a/crypto/simd.c b/crypto/simd.c index 48876266cf2d..56885af49c24 100644 --- a/crypto/simd.c +++ b/crypto/simd.c @@ -52,15 +52,11 @@ static int simd_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key, { struct simd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm); struct crypto_skcipher *child = &ctx->cryptd_tfm->base; - int err; crypto_skcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); crypto_skcipher_set_flags(child, crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); - err = crypto_skcipher_setkey(child, key, key_len); - crypto_skcipher_set_flags(tfm, crypto_skcipher_get_flags(child) & - CRYPTO_TFM_RES_MASK); - return err; + return crypto_skcipher_setkey(child, key, key_len); } static int simd_skcipher_encrypt(struct skcipher_request *req) @@ -295,15 +291,11 @@ static int simd_aead_setkey(struct crypto_aead *tfm, const u8 *key, { struct simd_aead_ctx *ctx = crypto_aead_ctx(tfm); struct crypto_aead *child = &ctx->cryptd_tfm->base; - int err; crypto_aead_clear_flags(child, CRYPTO_TFM_REQ_MASK); crypto_aead_set_flags(child, crypto_aead_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); - err = crypto_aead_setkey(child, key, key_len); - crypto_aead_set_flags(tfm, crypto_aead_get_flags(child) & - CRYPTO_TFM_RES_MASK); - return err; + return crypto_aead_setkey(child, key, key_len); } static int simd_aead_setauthsize(struct crypto_aead *tfm, unsigned int authsize) diff --git a/crypto/skcipher.c b/crypto/skcipher.c index 8c8735f75478..89137a197fc8 100644 --- a/crypto/skcipher.c +++ b/crypto/skcipher.c @@ -876,15 +876,11 @@ static int skcipher_setkey_simple(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { struct crypto_cipher *cipher = skcipher_cipher_simple(tfm); - int err; crypto_cipher_clear_flags(cipher, CRYPTO_TFM_REQ_MASK); crypto_cipher_set_flags(cipher, crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); - err = crypto_cipher_setkey(cipher, key, keylen); - 
crypto_skcipher_set_flags(tfm, crypto_cipher_get_flags(cipher) & - CRYPTO_TFM_RES_MASK); - return err; + return crypto_cipher_setkey(cipher, key, keylen); } static int skcipher_init_tfm_simple(struct crypto_skcipher *tfm) diff --git a/crypto/xts.c b/crypto/xts.c index ab117633d64e..19d55489e78b 100644 --- a/crypto/xts.c +++ b/crypto/xts.c @@ -61,8 +61,6 @@ static int setkey(struct crypto_skcipher *parent, const u8 *key, crypto_cipher_set_flags(tweak, crypto_skcipher_get_flags(parent) & CRYPTO_TFM_REQ_MASK); err = crypto_cipher_setkey(tweak, key + keylen, keylen); - crypto_skcipher_set_flags(parent, crypto_cipher_get_flags(tweak) & - CRYPTO_TFM_RES_MASK); if (err) return err; @@ -71,11 +69,7 @@ static int setkey(struct crypto_skcipher *parent, const u8 *key, crypto_skcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); crypto_skcipher_set_flags(child, crypto_skcipher_get_flags(parent) & CRYPTO_TFM_REQ_MASK); - err = crypto_skcipher_setkey(child, key, keylen); - crypto_skcipher_set_flags(parent, crypto_skcipher_get_flags(child) & - CRYPTO_TFM_RES_MASK); - - return err; + return crypto_skcipher_setkey(child, key, keylen); } /* diff --git a/drivers/crypto/amcc/crypto4xx_alg.c b/drivers/crypto/amcc/crypto4xx_alg.c index 121eb81df64f..f7fc0c464125 100644 --- a/drivers/crypto/amcc/crypto4xx_alg.c +++ b/drivers/crypto/amcc/crypto4xx_alg.c @@ -289,19 +289,11 @@ static int crypto4xx_sk_setup_fallback(struct crypto4xx_ctx *ctx, const u8 *key, unsigned int keylen) { - int rc; - crypto_sync_skcipher_clear_flags(ctx->sw_cipher.cipher, CRYPTO_TFM_REQ_MASK); crypto_sync_skcipher_set_flags(ctx->sw_cipher.cipher, crypto_skcipher_get_flags(cipher) & CRYPTO_TFM_REQ_MASK); - rc = crypto_sync_skcipher_setkey(ctx->sw_cipher.cipher, key, keylen); - crypto_skcipher_clear_flags(cipher, CRYPTO_TFM_RES_MASK); - crypto_skcipher_set_flags(cipher, - crypto_sync_skcipher_get_flags(ctx->sw_cipher.cipher) & - CRYPTO_TFM_RES_MASK); - - return rc; + return crypto_sync_skcipher_setkey(ctx->sw_cipher.cipher, key, keylen); } int crypto4xx_setkey_aes_ctr(struct crypto_skcipher *cipher, @@ -376,18 +368,10 @@ static int crypto4xx_aead_setup_fallback(struct crypto4xx_ctx *ctx, const u8 *key, unsigned int keylen) { - int rc; - crypto_aead_clear_flags(ctx->sw_cipher.aead, CRYPTO_TFM_REQ_MASK); crypto_aead_set_flags(ctx->sw_cipher.aead, crypto_aead_get_flags(cipher) & CRYPTO_TFM_REQ_MASK); - rc = crypto_aead_setkey(ctx->sw_cipher.aead, key, keylen); - crypto_aead_clear_flags(cipher, CRYPTO_TFM_RES_MASK); - crypto_aead_set_flags(cipher, - crypto_aead_get_flags(ctx->sw_cipher.aead) & - CRYPTO_TFM_RES_MASK); - - return rc; + return crypto_aead_setkey(ctx->sw_cipher.aead, key, keylen); } /** diff --git a/drivers/crypto/atmel-aes.c b/drivers/crypto/atmel-aes.c index 898f66cb2eb2..466c15b474da 100644 --- a/drivers/crypto/atmel-aes.c +++ b/drivers/crypto/atmel-aes.c @@ -2041,7 +2041,6 @@ static int atmel_aes_authenc_setkey(struct crypto_aead *tfm, const u8 *key, { struct atmel_aes_authenc_ctx *ctx = crypto_aead_ctx(tfm); struct crypto_authenc_keys keys; - u32 flags; int err; if (crypto_authenc_extractkeys(&keys, key, keylen) != 0) @@ -2051,11 +2050,9 @@ static int atmel_aes_authenc_setkey(struct crypto_aead *tfm, const u8 *key, goto badkey; /* Save auth key. 
*/ - flags = crypto_aead_get_flags(tfm); err = atmel_sha_authenc_setkey(ctx->auth, keys.authkey, keys.authkeylen, - &flags); - crypto_aead_set_flags(tfm, flags & CRYPTO_TFM_RES_MASK); + crypto_aead_get_flags(tfm)); if (err) { memzero_explicit(&keys, sizeof(keys)); return err; diff --git a/drivers/crypto/atmel-authenc.h b/drivers/crypto/atmel-authenc.h index d6de810df44f..c6530a1c8c20 100644 --- a/drivers/crypto/atmel-authenc.h +++ b/drivers/crypto/atmel-authenc.h @@ -30,8 +30,7 @@ unsigned int atmel_sha_authenc_get_reqsize(void); struct atmel_sha_authenc_ctx *atmel_sha_authenc_spawn(unsigned long mode); void atmel_sha_authenc_free(struct atmel_sha_authenc_ctx *auth); int atmel_sha_authenc_setkey(struct atmel_sha_authenc_ctx *auth, - const u8 *key, unsigned int keylen, - u32 *flags); + const u8 *key, unsigned int keylen, u32 flags); int atmel_sha_authenc_schedule(struct ahash_request *req, struct atmel_sha_authenc_ctx *auth, diff --git a/drivers/crypto/atmel-sha.c b/drivers/crypto/atmel-sha.c index d3bcd14201c2..079fdb8114e9 100644 --- a/drivers/crypto/atmel-sha.c +++ b/drivers/crypto/atmel-sha.c @@ -2207,18 +2207,13 @@ void atmel_sha_authenc_free(struct atmel_sha_authenc_ctx *auth) EXPORT_SYMBOL_GPL(atmel_sha_authenc_free); int atmel_sha_authenc_setkey(struct atmel_sha_authenc_ctx *auth, - const u8 *key, unsigned int keylen, - u32 *flags) + const u8 *key, unsigned int keylen, u32 flags) { struct crypto_ahash *tfm = auth->tfm; - int err; crypto_ahash_clear_flags(tfm, CRYPTO_TFM_REQ_MASK); - crypto_ahash_set_flags(tfm, *flags & CRYPTO_TFM_REQ_MASK); - err = crypto_ahash_setkey(tfm, key, keylen); - *flags = crypto_ahash_get_flags(tfm); - - return err; + crypto_ahash_set_flags(tfm, flags & CRYPTO_TFM_REQ_MASK); + return crypto_ahash_setkey(tfm, key, keylen); } EXPORT_SYMBOL_GPL(atmel_sha_authenc_setkey); diff --git a/drivers/crypto/bcm/cipher.c b/drivers/crypto/bcm/cipher.c index 184a3e1245cf..c8b9408541a9 100644 --- a/drivers/crypto/bcm/cipher.c +++ b/drivers/crypto/bcm/cipher.c @@ -2893,13 +2893,8 @@ static int aead_authenc_setkey(struct crypto_aead *cipher, ctx->fallback_cipher->base.crt_flags |= tfm->crt_flags & CRYPTO_TFM_REQ_MASK; ret = crypto_aead_setkey(ctx->fallback_cipher, key, keylen); - if (ret) { + if (ret) flow_log(" fallback setkey() returned:%d\n", ret); - tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK; - tfm->crt_flags |= - (ctx->fallback_cipher->base.crt_flags & - CRYPTO_TFM_RES_MASK); - } } ctx->spu_resp_hdr_len = spu->spu_response_hdr_len(ctx->authkeylen, @@ -2965,13 +2960,8 @@ static int aead_gcm_ccm_setkey(struct crypto_aead *cipher, tfm->crt_flags & CRYPTO_TFM_REQ_MASK; ret = crypto_aead_setkey(ctx->fallback_cipher, key, keylen + ctx->salt_len); - if (ret) { + if (ret) flow_log(" fallback setkey() returned:%d\n", ret); - tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK; - tfm->crt_flags |= - (ctx->fallback_cipher->base.crt_flags & - CRYPTO_TFM_RES_MASK); - } } ctx->spu_resp_hdr_len = spu->spu_response_hdr_len(ctx->authkeylen, diff --git a/drivers/crypto/chelsio/chcr_algo.c b/drivers/crypto/chelsio/chcr_algo.c index 720b2ff55464..b4b9b22125d1 100644 --- a/drivers/crypto/chelsio/chcr_algo.c +++ b/drivers/crypto/chelsio/chcr_algo.c @@ -870,20 +870,13 @@ static int chcr_cipher_fallback_setkey(struct crypto_skcipher *cipher, const u8 *key, unsigned int keylen) { - struct crypto_tfm *tfm = crypto_skcipher_tfm(cipher); struct ablk_ctx *ablkctx = ABLK_CTX(c_ctx(cipher)); - int err = 0; crypto_sync_skcipher_clear_flags(ablkctx->sw_cipher, CRYPTO_TFM_REQ_MASK); 
crypto_sync_skcipher_set_flags(ablkctx->sw_cipher, cipher->base.crt_flags & CRYPTO_TFM_REQ_MASK); - err = crypto_sync_skcipher_setkey(ablkctx->sw_cipher, key, keylen); - tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK; - tfm->crt_flags |= - crypto_sync_skcipher_get_flags(ablkctx->sw_cipher) & - CRYPTO_TFM_RES_MASK; - return err; + return crypto_sync_skcipher_setkey(ablkctx->sw_cipher, key, keylen); } static int chcr_aes_cbc_setkey(struct crypto_skcipher *cipher, @@ -3302,9 +3295,6 @@ static int chcr_aead_ccm_setkey(struct crypto_aead *aead, crypto_aead_set_flags(aeadctx->sw_cipher, crypto_aead_get_flags(aead) & CRYPTO_TFM_REQ_MASK); error = crypto_aead_setkey(aeadctx->sw_cipher, key, keylen); - crypto_aead_clear_flags(aead, CRYPTO_TFM_RES_MASK); - crypto_aead_set_flags(aead, crypto_aead_get_flags(aeadctx->sw_cipher) & - CRYPTO_TFM_RES_MASK); if (error) return error; return chcr_ccm_common_setkey(aead, key, keylen); @@ -3324,9 +3314,6 @@ static int chcr_aead_rfc4309_setkey(struct crypto_aead *aead, const u8 *key, crypto_aead_set_flags(aeadctx->sw_cipher, crypto_aead_get_flags(aead) & CRYPTO_TFM_REQ_MASK); error = crypto_aead_setkey(aeadctx->sw_cipher, key, keylen); - crypto_aead_clear_flags(aead, CRYPTO_TFM_RES_MASK); - crypto_aead_set_flags(aead, crypto_aead_get_flags(aeadctx->sw_cipher) & - CRYPTO_TFM_RES_MASK); if (error) return error; keylen -= 3; @@ -3348,9 +3335,6 @@ static int chcr_gcm_setkey(struct crypto_aead *aead, const u8 *key, crypto_aead_set_flags(aeadctx->sw_cipher, crypto_aead_get_flags(aead) & CRYPTO_TFM_REQ_MASK); ret = crypto_aead_setkey(aeadctx->sw_cipher, key, keylen); - crypto_aead_clear_flags(aead, CRYPTO_TFM_RES_MASK); - crypto_aead_set_flags(aead, crypto_aead_get_flags(aeadctx->sw_cipher) & - CRYPTO_TFM_RES_MASK); if (ret) goto out; @@ -3416,9 +3400,6 @@ static int chcr_authenc_setkey(struct crypto_aead *authenc, const u8 *key, crypto_aead_set_flags(aeadctx->sw_cipher, crypto_aead_get_flags(authenc) & CRYPTO_TFM_REQ_MASK); err = crypto_aead_setkey(aeadctx->sw_cipher, key, keylen); - crypto_aead_clear_flags(authenc, CRYPTO_TFM_RES_MASK); - crypto_aead_set_flags(authenc, crypto_aead_get_flags(aeadctx->sw_cipher) - & CRYPTO_TFM_RES_MASK); if (err) goto out; @@ -3544,9 +3525,6 @@ static int chcr_aead_digest_null_setkey(struct crypto_aead *authenc, crypto_aead_set_flags(aeadctx->sw_cipher, crypto_aead_get_flags(authenc) & CRYPTO_TFM_REQ_MASK); err = crypto_aead_setkey(aeadctx->sw_cipher, key, keylen); - crypto_aead_clear_flags(authenc, CRYPTO_TFM_RES_MASK); - crypto_aead_set_flags(authenc, crypto_aead_get_flags(aeadctx->sw_cipher) - & CRYPTO_TFM_RES_MASK); if (err) goto out; diff --git a/drivers/crypto/geode-aes.c b/drivers/crypto/geode-aes.c index eb6e6b618361..f4f18bfc2247 100644 --- a/drivers/crypto/geode-aes.c +++ b/drivers/crypto/geode-aes.c @@ -110,7 +110,6 @@ static int geode_setkey_cip(struct crypto_tfm *tfm, const u8 *key, unsigned int len) { struct geode_aes_tfm_ctx *tctx = crypto_tfm_ctx(tfm); - unsigned int ret; tctx->keylen = len; @@ -130,20 +129,13 @@ static int geode_setkey_cip(struct crypto_tfm *tfm, const u8 *key, tctx->fallback.cip->base.crt_flags |= (tfm->crt_flags & CRYPTO_TFM_REQ_MASK); - ret = crypto_cipher_setkey(tctx->fallback.cip, key, len); - if (ret) { - tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK; - tfm->crt_flags |= (tctx->fallback.cip->base.crt_flags & - CRYPTO_TFM_RES_MASK); - } - return ret; + return crypto_cipher_setkey(tctx->fallback.cip, key, len); } static int geode_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int len) { 
struct geode_aes_tfm_ctx *tctx = crypto_skcipher_ctx(tfm); - unsigned int ret; tctx->keylen = len; @@ -164,11 +156,7 @@ static int geode_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, crypto_skcipher_set_flags(tctx->fallback.skcipher, crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); - ret = crypto_skcipher_setkey(tctx->fallback.skcipher, key, len); - crypto_skcipher_set_flags(tfm, - crypto_skcipher_get_flags(tctx->fallback.skcipher) & - CRYPTO_TFM_RES_MASK); - return ret; + return crypto_skcipher_setkey(tctx->fallback.skcipher, key, len); } static void diff --git a/drivers/crypto/inside-secure/safexcel_cipher.c b/drivers/crypto/inside-secure/safexcel_cipher.c index 5ee66532f336..0c5e80c3f6e3 100644 --- a/drivers/crypto/inside-secure/safexcel_cipher.c +++ b/drivers/crypto/inside-secure/safexcel_cipher.c @@ -499,9 +499,6 @@ static int safexcel_aead_setkey(struct crypto_aead *ctfm, const u8 *key, goto badkey; } - crypto_aead_set_flags(ctfm, crypto_aead_get_flags(ctfm) & - CRYPTO_TFM_RES_MASK); - if (priv->flags & EIP197_TRC_CACHE && ctx->base.ctxr_dma && (memcmp(ctx->ipad, istate.state, ctx->state_sz) || memcmp(ctx->opad, ostate.state, ctx->state_sz))) @@ -2583,8 +2580,6 @@ static int safexcel_aead_gcm_setkey(struct crypto_aead *ctfm, const u8 *key, crypto_cipher_set_flags(ctx->hkaes, crypto_aead_get_flags(ctfm) & CRYPTO_TFM_REQ_MASK); ret = crypto_cipher_setkey(ctx->hkaes, key, len); - crypto_aead_set_flags(ctfm, crypto_cipher_get_flags(ctx->hkaes) & - CRYPTO_TFM_RES_MASK); if (ret) return ret; diff --git a/drivers/crypto/inside-secure/safexcel_hash.c b/drivers/crypto/inside-secure/safexcel_hash.c index 088d7f8aab5e..43962bc709c6 100644 --- a/drivers/crypto/inside-secure/safexcel_hash.c +++ b/drivers/crypto/inside-secure/safexcel_hash.c @@ -2069,8 +2069,6 @@ static int safexcel_xcbcmac_setkey(struct crypto_ahash *tfm, const u8 *key, crypto_cipher_set_flags(ctx->kaes, crypto_ahash_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); ret = crypto_cipher_setkey(ctx->kaes, key, len); - crypto_ahash_set_flags(tfm, crypto_cipher_get_flags(ctx->kaes) & - CRYPTO_TFM_RES_MASK); if (ret) return ret; @@ -2090,8 +2088,6 @@ static int safexcel_xcbcmac_setkey(struct crypto_ahash *tfm, const u8 *key, ret = crypto_cipher_setkey(ctx->kaes, (u8 *)key_tmp + 2 * AES_BLOCK_SIZE, AES_MIN_KEY_SIZE); - crypto_ahash_set_flags(tfm, crypto_cipher_get_flags(ctx->kaes) & - CRYPTO_TFM_RES_MASK); if (ret) return ret; @@ -2174,8 +2170,6 @@ static int safexcel_cmac_setkey(struct crypto_ahash *tfm, const u8 *key, crypto_cipher_set_flags(ctx->kaes, crypto_ahash_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); ret = crypto_cipher_setkey(ctx->kaes, key, len); - crypto_ahash_set_flags(tfm, crypto_cipher_get_flags(ctx->kaes) & - CRYPTO_TFM_RES_MASK); if (ret) return ret; diff --git a/drivers/crypto/mediatek/mtk-aes.c b/drivers/crypto/mediatek/mtk-aes.c index 00e580bf8536..78d660d963e2 100644 --- a/drivers/crypto/mediatek/mtk-aes.c +++ b/drivers/crypto/mediatek/mtk-aes.c @@ -1031,8 +1031,6 @@ static int mtk_aes_gcm_setkey(struct crypto_aead *aead, const u8 *key, crypto_skcipher_set_flags(ctr, crypto_aead_get_flags(aead) & CRYPTO_TFM_REQ_MASK); err = crypto_skcipher_setkey(ctr, key, keylen); - crypto_aead_set_flags(aead, crypto_skcipher_get_flags(ctr) & - CRYPTO_TFM_RES_MASK); if (err) return err; diff --git a/drivers/crypto/mxs-dcp.c b/drivers/crypto/mxs-dcp.c index f438b425c655..435ac1c83df9 100644 --- a/drivers/crypto/mxs-dcp.c +++ b/drivers/crypto/mxs-dcp.c @@ -492,7 +492,6 @@ static int mxs_dcp_aes_setkey(struct crypto_skcipher 
*tfm, const u8 *key, unsigned int len) { struct dcp_async_ctx *actx = crypto_skcipher_ctx(tfm); - unsigned int ret; /* * AES 128 is supposed by the hardware, store key into temporary @@ -513,16 +512,7 @@ static int mxs_dcp_aes_setkey(struct crypto_skcipher *tfm, const u8 *key, crypto_sync_skcipher_clear_flags(actx->fallback, CRYPTO_TFM_REQ_MASK); crypto_sync_skcipher_set_flags(actx->fallback, tfm->base.crt_flags & CRYPTO_TFM_REQ_MASK); - - ret = crypto_sync_skcipher_setkey(actx->fallback, key, len); - if (!ret) - return 0; - - tfm->base.crt_flags &= ~CRYPTO_TFM_RES_MASK; - tfm->base.crt_flags |= crypto_sync_skcipher_get_flags(actx->fallback) & - CRYPTO_TFM_RES_MASK; - - return ret; + return crypto_sync_skcipher_setkey(actx->fallback, key, len); } static int mxs_dcp_aes_fallback_init_tfm(struct crypto_skcipher *tfm) diff --git a/drivers/crypto/picoxcell_crypto.c b/drivers/crypto/picoxcell_crypto.c index ced4cbed9ea0..7384e91c8b32 100644 --- a/drivers/crypto/picoxcell_crypto.c +++ b/drivers/crypto/picoxcell_crypto.c @@ -465,9 +465,6 @@ static int spacc_aead_setkey(struct crypto_aead *tfm, const u8 *key, crypto_aead_set_flags(ctx->sw_cipher, crypto_aead_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); err = crypto_aead_setkey(ctx->sw_cipher, key, keylen); - crypto_aead_clear_flags(tfm, CRYPTO_TFM_RES_MASK); - crypto_aead_set_flags(tfm, crypto_aead_get_flags(ctx->sw_cipher) & - CRYPTO_TFM_RES_MASK); if (err) return err; @@ -802,12 +799,6 @@ static int spacc_aes_setkey(struct crypto_skcipher *cipher, const u8 *key, CRYPTO_TFM_REQ_MASK); err = crypto_sync_skcipher_setkey(ctx->sw_cipher, key, len); - - tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK; - tfm->crt_flags |= - crypto_sync_skcipher_get_flags(ctx->sw_cipher) & - CRYPTO_TFM_RES_MASK; - if (err) goto sw_setkey_failed; } diff --git a/drivers/crypto/sahara.c b/drivers/crypto/sahara.c index d4ea2f11ca68..466e30bd529c 100644 --- a/drivers/crypto/sahara.c +++ b/drivers/crypto/sahara.c @@ -601,7 +601,6 @@ static int sahara_aes_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { struct sahara_ctx *ctx = crypto_skcipher_ctx(tfm); - int ret; ctx->keylen = keylen; @@ -621,13 +620,7 @@ static int sahara_aes_setkey(struct crypto_skcipher *tfm, const u8 *key, crypto_sync_skcipher_clear_flags(ctx->fallback, CRYPTO_TFM_REQ_MASK); crypto_sync_skcipher_set_flags(ctx->fallback, tfm->base.crt_flags & CRYPTO_TFM_REQ_MASK); - - ret = crypto_sync_skcipher_setkey(ctx->fallback, key, keylen); - - tfm->base.crt_flags &= ~CRYPTO_TFM_RES_MASK; - tfm->base.crt_flags |= crypto_sync_skcipher_get_flags(ctx->fallback) & - CRYPTO_TFM_RES_MASK; - return ret; + return crypto_sync_skcipher_setkey(ctx->fallback, key, keylen); } static int sahara_aes_crypt(struct skcipher_request *req, unsigned long mode) diff --git a/include/linux/crypto.h b/include/linux/crypto.h index accd0c8038fd..763863dbc079 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -107,8 +107,6 @@ #define CRYPTO_TFM_NEED_KEY 0x00000001 #define CRYPTO_TFM_REQ_MASK 0x000fff00 -#define CRYPTO_TFM_RES_MASK 0xfff00000 - #define CRYPTO_TFM_REQ_FORBID_WEAK_KEYS 0x00000100 #define CRYPTO_TFM_REQ_MAY_SLEEP 0x00000200 #define CRYPTO_TFM_REQ_MAY_BACKLOG 0x00000400 -- cgit v1.2.3 From 1c08a104360f3e18f4ee6346c21cc3923efb952e Mon Sep 17 00:00:00 2001 From: "Jason A. 
Donenfeld" Date: Sun, 5 Jan 2020 22:40:46 -0500 Subject: crypto: poly1305 - add new 32 and 64-bit generic versions These two C implementations from Zinc -- a 32x32 one and a 64x64 one, depending on the platform -- come from Andrew Moon's public domain poly1305-donna portable code, modified for usage in the kernel. The precomputation in the 32-bit version and the use of 64x64 multiplies in the 64-bit version make these perform better than the code it replaces. Moon's code is also very widespread and has received many eyeballs of scrutiny. There's a bit of interference between the x86 implementation, which relies on internal details of the old scalar implementation. In the next commit, the x86 implementation will be replaced with a faster one that doesn't rely on this, so none of this matters much. But for now, to keep this passing the tests, we inline the bits of the old implementation that the x86 implementation relied on. Also, since we now support a slightly larger key space, via the union, some offsets had to be fixed up. Nonce calculation was folded in with the emit function, to take advantage of 64x64 arithmetic. However, Adiantum appeared to rely on no nonce handling in emit, so this path was conditionalized. We also introduced a new struct, poly1305_core_key, to represent the precise amount of space that particular implementation uses. Testing with kbench9000, depending on the CPU, the update function for the 32x32 version has been improved by 4%-7%, and for the 64x64 by 19%-30%. The 32x32 gains are small, but I think there's great value in having a parallel implementation to the 64x64 one so that the two can be compared side-by-side as nice stand-alone units. Signed-off-by: Jason A. Donenfeld Signed-off-by: Herbert Xu --- arch/x86/crypto/poly1305-avx2-x86_64.S | 20 +-- arch/x86/crypto/poly1305_glue.c | 215 +++++++++++++++++++++++++++++++-- crypto/adiantum.c | 4 +- crypto/nhpoly1305.c | 2 +- crypto/poly1305_generic.c | 25 +++- include/crypto/internal/poly1305.h | 45 ++----- include/crypto/nhpoly1305.h | 4 +- include/crypto/poly1305.h | 26 +++- lib/crypto/Makefile | 4 +- lib/crypto/poly1305-donna32.c | 204 +++++++++++++++++++++++++++++++ lib/crypto/poly1305-donna64.c | 185 ++++++++++++++++++++++++++++ lib/crypto/poly1305.c | 169 ++------------------------ 12 files changed, 675 insertions(+), 228 deletions(-) create mode 100644 lib/crypto/poly1305-donna32.c create mode 100644 lib/crypto/poly1305-donna64.c (limited to 'arch/x86') diff --git a/arch/x86/crypto/poly1305-avx2-x86_64.S b/arch/x86/crypto/poly1305-avx2-x86_64.S index d6063feda9da..8f56989ea599 100644 --- a/arch/x86/crypto/poly1305-avx2-x86_64.S +++ b/arch/x86/crypto/poly1305-avx2-x86_64.S @@ -34,16 +34,16 @@ ORMASK: .octa 0x00000000010000000000000001000000 #define u2 0x08(%r8) #define u3 0x0c(%r8) #define u4 0x10(%r8) -#define w0 0x14(%r8) -#define w1 0x18(%r8) -#define w2 0x1c(%r8) -#define w3 0x20(%r8) -#define w4 0x24(%r8) -#define y0 0x28(%r8) -#define y1 0x2c(%r8) -#define y2 0x30(%r8) -#define y3 0x34(%r8) -#define y4 0x38(%r8) +#define w0 0x18(%r8) +#define w1 0x1c(%r8) +#define w2 0x20(%r8) +#define w3 0x24(%r8) +#define w4 0x28(%r8) +#define y0 0x30(%r8) +#define y1 0x34(%r8) +#define y2 0x38(%r8) +#define y3 0x3c(%r8) +#define y4 0x40(%r8) #define m %rsi #define hc0 %ymm0 #define hc1 %ymm1 diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c index 0cc4537e6617..edb7113e36f3 100644 --- a/arch/x86/crypto/poly1305_glue.c +++ b/arch/x86/crypto/poly1305_glue.c @@ -25,6 +25,21 @@ asmlinkage void 
poly1305_4block_avx2(u32 *h, const u8 *src, const u32 *r, static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_simd); static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2); +static inline u64 mlt(u64 a, u64 b) +{ + return a * b; +} + +static inline u32 sr(u64 v, u_char n) +{ + return v >> n; +} + +static inline u32 and(u32 v, u32 mask) +{ + return v & mask; +} + static void poly1305_simd_mult(u32 *a, const u32 *b) { u8 m[POLY1305_BLOCK_SIZE]; @@ -36,6 +51,168 @@ static void poly1305_simd_mult(u32 *a, const u32 *b) poly1305_block_sse2(a, m, b, 1); } +static void poly1305_integer_setkey(struct poly1305_key *key, const u8 *raw_key) +{ + /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ + key->r[0] = (get_unaligned_le32(raw_key + 0) >> 0) & 0x3ffffff; + key->r[1] = (get_unaligned_le32(raw_key + 3) >> 2) & 0x3ffff03; + key->r[2] = (get_unaligned_le32(raw_key + 6) >> 4) & 0x3ffc0ff; + key->r[3] = (get_unaligned_le32(raw_key + 9) >> 6) & 0x3f03fff; + key->r[4] = (get_unaligned_le32(raw_key + 12) >> 8) & 0x00fffff; +} + +static void poly1305_integer_blocks(struct poly1305_state *state, + const struct poly1305_key *key, + const void *src, + unsigned int nblocks, u32 hibit) +{ + u32 r0, r1, r2, r3, r4; + u32 s1, s2, s3, s4; + u32 h0, h1, h2, h3, h4; + u64 d0, d1, d2, d3, d4; + + if (!nblocks) + return; + + r0 = key->r[0]; + r1 = key->r[1]; + r2 = key->r[2]; + r3 = key->r[3]; + r4 = key->r[4]; + + s1 = r1 * 5; + s2 = r2 * 5; + s3 = r3 * 5; + s4 = r4 * 5; + + h0 = state->h[0]; + h1 = state->h[1]; + h2 = state->h[2]; + h3 = state->h[3]; + h4 = state->h[4]; + + do { + /* h += m[i] */ + h0 += (get_unaligned_le32(src + 0) >> 0) & 0x3ffffff; + h1 += (get_unaligned_le32(src + 3) >> 2) & 0x3ffffff; + h2 += (get_unaligned_le32(src + 6) >> 4) & 0x3ffffff; + h3 += (get_unaligned_le32(src + 9) >> 6) & 0x3ffffff; + h4 += (get_unaligned_le32(src + 12) >> 8) | (hibit << 24); + + /* h *= r */ + d0 = mlt(h0, r0) + mlt(h1, s4) + mlt(h2, s3) + + mlt(h3, s2) + mlt(h4, s1); + d1 = mlt(h0, r1) + mlt(h1, r0) + mlt(h2, s4) + + mlt(h3, s3) + mlt(h4, s2); + d2 = mlt(h0, r2) + mlt(h1, r1) + mlt(h2, r0) + + mlt(h3, s4) + mlt(h4, s3); + d3 = mlt(h0, r3) + mlt(h1, r2) + mlt(h2, r1) + + mlt(h3, r0) + mlt(h4, s4); + d4 = mlt(h0, r4) + mlt(h1, r3) + mlt(h2, r2) + + mlt(h3, r1) + mlt(h4, r0); + + /* (partial) h %= p */ + d1 += sr(d0, 26); h0 = and(d0, 0x3ffffff); + d2 += sr(d1, 26); h1 = and(d1, 0x3ffffff); + d3 += sr(d2, 26); h2 = and(d2, 0x3ffffff); + d4 += sr(d3, 26); h3 = and(d3, 0x3ffffff); + h0 += sr(d4, 26) * 5; h4 = and(d4, 0x3ffffff); + h1 += h0 >> 26; h0 = h0 & 0x3ffffff; + + src += POLY1305_BLOCK_SIZE; + } while (--nblocks); + + state->h[0] = h0; + state->h[1] = h1; + state->h[2] = h2; + state->h[3] = h3; + state->h[4] = h4; +} + +static void poly1305_integer_emit(const struct poly1305_state *state, void *dst) +{ + u32 h0, h1, h2, h3, h4; + u32 g0, g1, g2, g3, g4; + u32 mask; + + /* fully carry h */ + h0 = state->h[0]; + h1 = state->h[1]; + h2 = state->h[2]; + h3 = state->h[3]; + h4 = state->h[4]; + + h2 += (h1 >> 26); h1 = h1 & 0x3ffffff; + h3 += (h2 >> 26); h2 = h2 & 0x3ffffff; + h4 += (h3 >> 26); h3 = h3 & 0x3ffffff; + h0 += (h4 >> 26) * 5; h4 = h4 & 0x3ffffff; + h1 += (h0 >> 26); h0 = h0 & 0x3ffffff; + + /* compute h + -p */ + g0 = h0 + 5; + g1 = h1 + (g0 >> 26); g0 &= 0x3ffffff; + g2 = h2 + (g1 >> 26); g1 &= 0x3ffffff; + g3 = h3 + (g2 >> 26); g2 &= 0x3ffffff; + g4 = h4 + (g3 >> 26) - (1 << 26); g3 &= 0x3ffffff; + + /* select h if h < p, or h + -p if h >= p */ + mask = (g4 >> ((sizeof(u32) * 8) - 1)) - 1; 
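	/*
	 * (Editorial gloss, not part of the original patch:) this is a
	 * branchless select.  The g limbs hold h + 5 - 2^130, i.e. h - p,
	 * computed limb-wise.  If h < p the subtraction borrowed, g4's
	 * top bit is set, and (g4 >> 31) - 1 gives mask == 0, so the g
	 * limbs are zeroed below and h survives.  If h >= p the top bit
	 * is clear, mask == 0xffffffff, and the reduced value g is kept
	 * instead -- all without a data-dependent branch.
	 */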
+ g0 &= mask; + g1 &= mask; + g2 &= mask; + g3 &= mask; + g4 &= mask; + mask = ~mask; + h0 = (h0 & mask) | g0; + h1 = (h1 & mask) | g1; + h2 = (h2 & mask) | g2; + h3 = (h3 & mask) | g3; + h4 = (h4 & mask) | g4; + + /* h = h % (2^128) */ + put_unaligned_le32((h0 >> 0) | (h1 << 26), dst + 0); + put_unaligned_le32((h1 >> 6) | (h2 << 20), dst + 4); + put_unaligned_le32((h2 >> 12) | (h3 << 14), dst + 8); + put_unaligned_le32((h3 >> 18) | (h4 << 8), dst + 12); +} + +void poly1305_init_arch(struct poly1305_desc_ctx *desc, const u8 *key) +{ + poly1305_integer_setkey(desc->opaque_r, key); + desc->s[0] = get_unaligned_le32(key + 16); + desc->s[1] = get_unaligned_le32(key + 20); + desc->s[2] = get_unaligned_le32(key + 24); + desc->s[3] = get_unaligned_le32(key + 28); + poly1305_core_init(&desc->h); + desc->buflen = 0; + desc->sset = true; + desc->rset = 1; +} +EXPORT_SYMBOL_GPL(poly1305_init_arch); + +static unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx, + const u8 *src, unsigned int srclen) +{ + if (!dctx->sset) { + if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) { + poly1305_integer_setkey(dctx->r, src); + src += POLY1305_BLOCK_SIZE; + srclen -= POLY1305_BLOCK_SIZE; + dctx->rset = 1; + } + if (srclen >= POLY1305_BLOCK_SIZE) { + dctx->s[0] = get_unaligned_le32(src + 0); + dctx->s[1] = get_unaligned_le32(src + 4); + dctx->s[2] = get_unaligned_le32(src + 8); + dctx->s[3] = get_unaligned_le32(src + 12); + src += POLY1305_BLOCK_SIZE; + srclen -= POLY1305_BLOCK_SIZE; + dctx->sset = true; + } + } + return srclen; +} + static unsigned int poly1305_scalar_blocks(struct poly1305_desc_ctx *dctx, const u8 *src, unsigned int srclen) { @@ -47,8 +224,8 @@ static unsigned int poly1305_scalar_blocks(struct poly1305_desc_ctx *dctx, srclen = datalen; } if (srclen >= POLY1305_BLOCK_SIZE) { - poly1305_core_blocks(&dctx->h, dctx->r, src, - srclen / POLY1305_BLOCK_SIZE, 1); + poly1305_integer_blocks(&dctx->h, dctx->opaque_r, src, + srclen / POLY1305_BLOCK_SIZE, 1); srclen %= POLY1305_BLOCK_SIZE; } return srclen; @@ -105,12 +282,6 @@ static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx, return srclen; } -void poly1305_init_arch(struct poly1305_desc_ctx *desc, const u8 *key) -{ - poly1305_init_generic(desc, key); -} -EXPORT_SYMBOL(poly1305_init_arch); - void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src, unsigned int srclen) { @@ -158,9 +329,31 @@ void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src, } EXPORT_SYMBOL(poly1305_update_arch); -void poly1305_final_arch(struct poly1305_desc_ctx *desc, u8 *digest) +void poly1305_final_arch(struct poly1305_desc_ctx *desc, u8 *dst) { - poly1305_final_generic(desc, digest); + __le32 digest[4]; + u64 f = 0; + + if (unlikely(desc->buflen)) { + desc->buf[desc->buflen++] = 1; + memset(desc->buf + desc->buflen, 0, + POLY1305_BLOCK_SIZE - desc->buflen); + poly1305_integer_blocks(&desc->h, desc->opaque_r, desc->buf, 1, 0); + } + + poly1305_integer_emit(&desc->h, digest); + + /* mac = (h + s) % (2^128) */ + f = (f >> 32) + le32_to_cpu(digest[0]) + desc->s[0]; + put_unaligned_le32(f, dst + 0); + f = (f >> 32) + le32_to_cpu(digest[1]) + desc->s[1]; + put_unaligned_le32(f, dst + 4); + f = (f >> 32) + le32_to_cpu(digest[2]) + desc->s[2]; + put_unaligned_le32(f, dst + 8); + f = (f >> 32) + le32_to_cpu(digest[3]) + desc->s[3]; + put_unaligned_le32(f, dst + 12); + + *desc = (struct poly1305_desc_ctx){}; } EXPORT_SYMBOL(poly1305_final_arch); @@ -183,7 +376,7 @@ static int crypto_poly1305_final(struct shash_desc 
*desc, u8 *dst) if (unlikely(!dctx->sset)) return -ENOKEY; - poly1305_final_generic(dctx, dst); + poly1305_final_arch(dctx, dst); return 0; } diff --git a/crypto/adiantum.c b/crypto/adiantum.c index 4d7a6cac82ed..53d5e705a425 100644 --- a/crypto/adiantum.c +++ b/crypto/adiantum.c @@ -70,7 +70,7 @@ struct adiantum_tfm_ctx { struct crypto_skcipher *streamcipher; struct crypto_cipher *blockcipher; struct crypto_shash *hash; - struct poly1305_key header_hash_key; + struct poly1305_core_key header_hash_key; }; struct adiantum_request_ctx { @@ -239,7 +239,7 @@ static void adiantum_hash_header(struct skcipher_request *req) poly1305_core_blocks(&state, &tctx->header_hash_key, req->iv, TWEAK_SIZE / POLY1305_BLOCK_SIZE, 1); - poly1305_core_emit(&state, &rctx->header_hash); + poly1305_core_emit(&state, NULL, &rctx->header_hash); } /* Hash the left-hand part (the "bulk") of the message using NHPoly1305 */ diff --git a/crypto/nhpoly1305.c b/crypto/nhpoly1305.c index f6b6a52092b4..8a3006c3b51b 100644 --- a/crypto/nhpoly1305.c +++ b/crypto/nhpoly1305.c @@ -210,7 +210,7 @@ int crypto_nhpoly1305_final_helper(struct shash_desc *desc, u8 *dst, nh_t nh_fn) if (state->nh_remaining) process_nh_hash_value(state, key); - poly1305_core_emit(&state->poly_state, dst); + poly1305_core_emit(&state->poly_state, NULL, dst); return 0; } EXPORT_SYMBOL(crypto_nhpoly1305_final_helper); diff --git a/crypto/poly1305_generic.c b/crypto/poly1305_generic.c index 21edbd8c99fb..94af47eb6fa6 100644 --- a/crypto/poly1305_generic.c +++ b/crypto/poly1305_generic.c @@ -31,6 +31,29 @@ static int crypto_poly1305_init(struct shash_desc *desc) return 0; } +static unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx, + const u8 *src, unsigned int srclen) +{ + if (!dctx->sset) { + if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) { + poly1305_core_setkey(&dctx->core_r, src); + src += POLY1305_BLOCK_SIZE; + srclen -= POLY1305_BLOCK_SIZE; + dctx->rset = 2; + } + if (srclen >= POLY1305_BLOCK_SIZE) { + dctx->s[0] = get_unaligned_le32(src + 0); + dctx->s[1] = get_unaligned_le32(src + 4); + dctx->s[2] = get_unaligned_le32(src + 8); + dctx->s[3] = get_unaligned_le32(src + 12); + src += POLY1305_BLOCK_SIZE; + srclen -= POLY1305_BLOCK_SIZE; + dctx->sset = true; + } + } + return srclen; +} + static void poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src, unsigned int srclen) { @@ -42,7 +65,7 @@ static void poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src, srclen = datalen; } - poly1305_core_blocks(&dctx->h, dctx->r, src, + poly1305_core_blocks(&dctx->h, &dctx->core_r, src, srclen / POLY1305_BLOCK_SIZE, 1); } diff --git a/include/crypto/internal/poly1305.h b/include/crypto/internal/poly1305.h index 479b0cab2a1a..064e52ca5248 100644 --- a/include/crypto/internal/poly1305.h +++ b/include/crypto/internal/poly1305.h @@ -11,48 +11,23 @@ #include /* - * Poly1305 core functions. These implement the ε-almost-∆-universal hash - * function underlying the Poly1305 MAC, i.e. they don't add an encrypted nonce - * ("s key") at the end. They also only support block-aligned inputs. + * Poly1305 core functions. These only accept whole blocks; the caller must + * handle any needed block buffering and padding. 'hibit' must be 1 for any + * full blocks, or 0 for the final block if it had to be padded. If 'nonce' is + * non-NULL, then it's added at the end to compute the Poly1305 MAC. Otherwise, + * only the ε-almost-∆-universal hash function (not the full MAC) is computed. 
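 * As a concrete illustration (an editorial sketch, not part of this
 * patch), a caller computing the full MAC over block-aligned data with
 * this interface would do roughly:
 *
 *	struct poly1305_core_key key;
 *	struct poly1305_state state;
 *	u32 nonce[4];	<- the four le32 words of key bytes 16..31
 *
 *	poly1305_core_setkey(&key, raw_key);	<- first 16 key bytes ("r")
 *	poly1305_core_init(&state);
 *	poly1305_core_blocks(&state, &key, buf, nblocks, 1);
 *	poly1305_core_emit(&state, nonce, mac);	<- pass NULL instead of
 *						   nonce for the hash-only case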
*/ -void poly1305_core_setkey(struct poly1305_key *key, const u8 *raw_key); + +void poly1305_core_setkey(struct poly1305_core_key *key, const u8 *raw_key); static inline void poly1305_core_init(struct poly1305_state *state) { *state = (struct poly1305_state){}; } void poly1305_core_blocks(struct poly1305_state *state, - const struct poly1305_key *key, const void *src, + const struct poly1305_core_key *key, const void *src, unsigned int nblocks, u32 hibit); -void poly1305_core_emit(const struct poly1305_state *state, void *dst); - -/* - * Poly1305 requires a unique key for each tag, which implies that we can't set - * it on the tfm that gets accessed by multiple users simultaneously. Instead we - * expect the key as the first 32 bytes in the update() call. - */ -static inline -unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx, - const u8 *src, unsigned int srclen) -{ - if (!dctx->sset) { - if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) { - poly1305_core_setkey(dctx->r, src); - src += POLY1305_BLOCK_SIZE; - srclen -= POLY1305_BLOCK_SIZE; - dctx->rset = 1; - } - if (srclen >= POLY1305_BLOCK_SIZE) { - dctx->s[0] = get_unaligned_le32(src + 0); - dctx->s[1] = get_unaligned_le32(src + 4); - dctx->s[2] = get_unaligned_le32(src + 8); - dctx->s[3] = get_unaligned_le32(src + 12); - src += POLY1305_BLOCK_SIZE; - srclen -= POLY1305_BLOCK_SIZE; - dctx->sset = true; - } - } - return srclen; -} +void poly1305_core_emit(const struct poly1305_state *state, const u32 nonce[4], + void *dst); #endif diff --git a/include/crypto/nhpoly1305.h b/include/crypto/nhpoly1305.h index 53c04423c582..306925fea190 100644 --- a/include/crypto/nhpoly1305.h +++ b/include/crypto/nhpoly1305.h @@ -7,7 +7,7 @@ #define _NHPOLY1305_H #include -#include +#include /* NH parameterization: */ @@ -33,7 +33,7 @@ #define NHPOLY1305_KEY_SIZE (POLY1305_BLOCK_SIZE + NH_KEY_BYTES) struct nhpoly1305_key { - struct poly1305_key poly_key; + struct poly1305_core_key poly_key; u32 nh_key[NH_KEY_WORDS]; }; diff --git a/include/crypto/poly1305.h b/include/crypto/poly1305.h index 74c6e1cd73ee..f1f67fc749cf 100644 --- a/include/crypto/poly1305.h +++ b/include/crypto/poly1305.h @@ -13,12 +13,29 @@ #define POLY1305_KEY_SIZE 32 #define POLY1305_DIGEST_SIZE 16 +/* The poly1305_key and poly1305_state types are mostly opaque and + * implementation-defined. Limbs might be in base 2^64 or base 2^26, or + * different yet. The union type provided keeps these 64-bit aligned for the + * case in which this is implemented using 64x64 multiplies. 
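 * (Illustrative numbers from the donna code added later in this patch:
 * the 32-bit flavour keeps values in five 26-bit limbs via r[]/h[],
 * while the 64-bit flavour keeps them in three limbs of 44, 44 and 42
 * bits via r64[]/h64[]; either way 5*26 = 44+44+42 = 130 bits.)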
+ */ + struct poly1305_key { - u32 r[5]; /* key, base 2^26 */ + union { + u32 r[5]; + u64 r64[3]; + }; +}; + +struct poly1305_core_key { + struct poly1305_key key; + struct poly1305_key precomputed_s; }; struct poly1305_state { - u32 h[5]; /* accumulator, base 2^26 */ + union { + u32 h[5]; + u64 h64[3]; + }; }; struct poly1305_desc_ctx { @@ -35,7 +52,10 @@ struct poly1305_desc_ctx { /* accumulator */ struct poly1305_state h; /* key */ - struct poly1305_key r[CONFIG_CRYPTO_LIB_POLY1305_RSIZE]; + union { + struct poly1305_key opaque_r[CONFIG_CRYPTO_LIB_POLY1305_RSIZE]; + struct poly1305_core_key core_r; + }; }; void poly1305_init_arch(struct poly1305_desc_ctx *desc, const u8 *key); diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index f97f9b941110..6ecaf83a5a9a 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -28,7 +28,9 @@ obj-$(CONFIG_CRYPTO_LIB_DES) += libdes.o libdes-y := des.o obj-$(CONFIG_CRYPTO_LIB_POLY1305_GENERIC) += libpoly1305.o -libpoly1305-y := poly1305.o +libpoly1305-y := poly1305-donna32.o +libpoly1305-$(CONFIG_ARCH_SUPPORTS_INT128) := poly1305-donna64.o +libpoly1305-y += poly1305.o obj-$(CONFIG_CRYPTO_LIB_SHA256) += libsha256.o libsha256-y := sha256.o diff --git a/lib/crypto/poly1305-donna32.c b/lib/crypto/poly1305-donna32.c new file mode 100644 index 000000000000..3cc77d94390b --- /dev/null +++ b/lib/crypto/poly1305-donna32.c @@ -0,0 +1,204 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT +/* + * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. + * + * This is based in part on Andrew Moon's poly1305-donna, which is in the + * public domain. + */ + +#include +#include +#include + +void poly1305_core_setkey(struct poly1305_core_key *key, const u8 raw_key[16]) +{ + /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ + key->key.r[0] = (get_unaligned_le32(&raw_key[0])) & 0x3ffffff; + key->key.r[1] = (get_unaligned_le32(&raw_key[3]) >> 2) & 0x3ffff03; + key->key.r[2] = (get_unaligned_le32(&raw_key[6]) >> 4) & 0x3ffc0ff; + key->key.r[3] = (get_unaligned_le32(&raw_key[9]) >> 6) & 0x3f03fff; + key->key.r[4] = (get_unaligned_le32(&raw_key[12]) >> 8) & 0x00fffff; + + /* s = 5*r */ + key->precomputed_s.r[0] = key->key.r[1] * 5; + key->precomputed_s.r[1] = key->key.r[2] * 5; + key->precomputed_s.r[2] = key->key.r[3] * 5; + key->precomputed_s.r[3] = key->key.r[4] * 5; +} +EXPORT_SYMBOL(poly1305_core_setkey); + +void poly1305_core_blocks(struct poly1305_state *state, + const struct poly1305_core_key *key, const void *src, + unsigned int nblocks, u32 hibit) +{ + const u8 *input = src; + u32 r0, r1, r2, r3, r4; + u32 s1, s2, s3, s4; + u32 h0, h1, h2, h3, h4; + u64 d0, d1, d2, d3, d4; + u32 c; + + if (!nblocks) + return; + + hibit <<= 24; + + r0 = key->key.r[0]; + r1 = key->key.r[1]; + r2 = key->key.r[2]; + r3 = key->key.r[3]; + r4 = key->key.r[4]; + + s1 = key->precomputed_s.r[0]; + s2 = key->precomputed_s.r[1]; + s3 = key->precomputed_s.r[2]; + s4 = key->precomputed_s.r[3]; + + h0 = state->h[0]; + h1 = state->h[1]; + h2 = state->h[2]; + h3 = state->h[3]; + h4 = state->h[4]; + + do { + /* h += m[i] */ + h0 += (get_unaligned_le32(&input[0])) & 0x3ffffff; + h1 += (get_unaligned_le32(&input[3]) >> 2) & 0x3ffffff; + h2 += (get_unaligned_le32(&input[6]) >> 4) & 0x3ffffff; + h3 += (get_unaligned_le32(&input[9]) >> 6) & 0x3ffffff; + h4 += (get_unaligned_le32(&input[12]) >> 8) | hibit; + + /* h *= r */ + d0 = ((u64)h0 * r0) + ((u64)h1 * s4) + + ((u64)h2 * s3) + ((u64)h3 * s2) + + ((u64)h4 * s1); + d1 = ((u64)h0 * r1) + ((u64)h1 * r0) + + ((u64)h2 * s4) + ((u64)h3 * s3) + 
+ ((u64)h4 * s2); + d2 = ((u64)h0 * r2) + ((u64)h1 * r1) + + ((u64)h2 * r0) + ((u64)h3 * s4) + + ((u64)h4 * s3); + d3 = ((u64)h0 * r3) + ((u64)h1 * r2) + + ((u64)h2 * r1) + ((u64)h3 * r0) + + ((u64)h4 * s4); + d4 = ((u64)h0 * r4) + ((u64)h1 * r3) + + ((u64)h2 * r2) + ((u64)h3 * r1) + + ((u64)h4 * r0); + + /* (partial) h %= p */ + c = (u32)(d0 >> 26); + h0 = (u32)d0 & 0x3ffffff; + d1 += c; + c = (u32)(d1 >> 26); + h1 = (u32)d1 & 0x3ffffff; + d2 += c; + c = (u32)(d2 >> 26); + h2 = (u32)d2 & 0x3ffffff; + d3 += c; + c = (u32)(d3 >> 26); + h3 = (u32)d3 & 0x3ffffff; + d4 += c; + c = (u32)(d4 >> 26); + h4 = (u32)d4 & 0x3ffffff; + h0 += c * 5; + c = (h0 >> 26); + h0 = h0 & 0x3ffffff; + h1 += c; + + input += POLY1305_BLOCK_SIZE; + } while (--nblocks); + + state->h[0] = h0; + state->h[1] = h1; + state->h[2] = h2; + state->h[3] = h3; + state->h[4] = h4; +} +EXPORT_SYMBOL(poly1305_core_blocks); + +void poly1305_core_emit(const struct poly1305_state *state, const u32 nonce[4], + void *dst) +{ + u8 *mac = dst; + u32 h0, h1, h2, h3, h4, c; + u32 g0, g1, g2, g3, g4; + u64 f; + u32 mask; + + /* fully carry h */ + h0 = state->h[0]; + h1 = state->h[1]; + h2 = state->h[2]; + h3 = state->h[3]; + h4 = state->h[4]; + + c = h1 >> 26; + h1 = h1 & 0x3ffffff; + h2 += c; + c = h2 >> 26; + h2 = h2 & 0x3ffffff; + h3 += c; + c = h3 >> 26; + h3 = h3 & 0x3ffffff; + h4 += c; + c = h4 >> 26; + h4 = h4 & 0x3ffffff; + h0 += c * 5; + c = h0 >> 26; + h0 = h0 & 0x3ffffff; + h1 += c; + + /* compute h + -p */ + g0 = h0 + 5; + c = g0 >> 26; + g0 &= 0x3ffffff; + g1 = h1 + c; + c = g1 >> 26; + g1 &= 0x3ffffff; + g2 = h2 + c; + c = g2 >> 26; + g2 &= 0x3ffffff; + g3 = h3 + c; + c = g3 >> 26; + g3 &= 0x3ffffff; + g4 = h4 + c - (1UL << 26); + + /* select h if h < p, or h + -p if h >= p */ + mask = (g4 >> ((sizeof(u32) * 8) - 1)) - 1; + g0 &= mask; + g1 &= mask; + g2 &= mask; + g3 &= mask; + g4 &= mask; + mask = ~mask; + + h0 = (h0 & mask) | g0; + h1 = (h1 & mask) | g1; + h2 = (h2 & mask) | g2; + h3 = (h3 & mask) | g3; + h4 = (h4 & mask) | g4; + + /* h = h % (2^128) */ + h0 = ((h0) | (h1 << 26)) & 0xffffffff; + h1 = ((h1 >> 6) | (h2 << 20)) & 0xffffffff; + h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff; + h3 = ((h3 >> 18) | (h4 << 8)) & 0xffffffff; + + if (likely(nonce)) { + /* mac = (h + nonce) % (2^128) */ + f = (u64)h0 + nonce[0]; + h0 = (u32)f; + f = (u64)h1 + nonce[1] + (f >> 32); + h1 = (u32)f; + f = (u64)h2 + nonce[2] + (f >> 32); + h2 = (u32)f; + f = (u64)h3 + nonce[3] + (f >> 32); + h3 = (u32)f; + } + + put_unaligned_le32(h0, &mac[0]); + put_unaligned_le32(h1, &mac[4]); + put_unaligned_le32(h2, &mac[8]); + put_unaligned_le32(h3, &mac[12]); +} +EXPORT_SYMBOL(poly1305_core_emit); diff --git a/lib/crypto/poly1305-donna64.c b/lib/crypto/poly1305-donna64.c new file mode 100644 index 000000000000..6ae181bb4345 --- /dev/null +++ b/lib/crypto/poly1305-donna64.c @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT +/* + * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. + * + * This is based in part on Andrew Moon's poly1305-donna, which is in the + * public domain. 
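 * (Editorial note on the precomputation below: with limbs of 44, 44 and
 * 42 bits, every cross term of h*r that spills past the top limb does
 * so with a factor of 2^132, and 2^132 = 4 * 2^130 == 4 * 5 = 20
 * (mod 2^130 - 5) -- hence "s = 20*r" here, versus "s = 5*r" in the
 * base-2^26 code above, where spills land exactly on 2^130.)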
+ */ + +#include +#include +#include + +typedef __uint128_t u128; + +void poly1305_core_setkey(struct poly1305_core_key *key, const u8 raw_key[16]) +{ + u64 t0, t1; + + /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ + t0 = get_unaligned_le64(&raw_key[0]); + t1 = get_unaligned_le64(&raw_key[8]); + + key->key.r64[0] = t0 & 0xffc0fffffffULL; + key->key.r64[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffffULL; + key->key.r64[2] = ((t1 >> 24)) & 0x00ffffffc0fULL; + + /* s = 20*r */ + key->precomputed_s.r64[0] = key->key.r64[1] * 20; + key->precomputed_s.r64[1] = key->key.r64[2] * 20; +} +EXPORT_SYMBOL(poly1305_core_setkey); + +void poly1305_core_blocks(struct poly1305_state *state, + const struct poly1305_core_key *key, const void *src, + unsigned int nblocks, u32 hibit) +{ + const u8 *input = src; + u64 hibit64; + u64 r0, r1, r2; + u64 s1, s2; + u64 h0, h1, h2; + u64 c; + u128 d0, d1, d2, d; + + if (!nblocks) + return; + + hibit64 = ((u64)hibit) << 40; + + r0 = key->key.r64[0]; + r1 = key->key.r64[1]; + r2 = key->key.r64[2]; + + h0 = state->h64[0]; + h1 = state->h64[1]; + h2 = state->h64[2]; + + s1 = key->precomputed_s.r64[0]; + s2 = key->precomputed_s.r64[1]; + + do { + u64 t0, t1; + + /* h += m[i] */ + t0 = get_unaligned_le64(&input[0]); + t1 = get_unaligned_le64(&input[8]); + + h0 += t0 & 0xfffffffffffULL; + h1 += ((t0 >> 44) | (t1 << 20)) & 0xfffffffffffULL; + h2 += (((t1 >> 24)) & 0x3ffffffffffULL) | hibit64; + + /* h *= r */ + d0 = (u128)h0 * r0; + d = (u128)h1 * s2; + d0 += d; + d = (u128)h2 * s1; + d0 += d; + d1 = (u128)h0 * r1; + d = (u128)h1 * r0; + d1 += d; + d = (u128)h2 * s2; + d1 += d; + d2 = (u128)h0 * r2; + d = (u128)h1 * r1; + d2 += d; + d = (u128)h2 * r0; + d2 += d; + + /* (partial) h %= p */ + c = (u64)(d0 >> 44); + h0 = (u64)d0 & 0xfffffffffffULL; + d1 += c; + c = (u64)(d1 >> 44); + h1 = (u64)d1 & 0xfffffffffffULL; + d2 += c; + c = (u64)(d2 >> 42); + h2 = (u64)d2 & 0x3ffffffffffULL; + h0 += c * 5; + c = h0 >> 44; + h0 = h0 & 0xfffffffffffULL; + h1 += c; + + input += POLY1305_BLOCK_SIZE; + } while (--nblocks); + + state->h64[0] = h0; + state->h64[1] = h1; + state->h64[2] = h2; +} +EXPORT_SYMBOL(poly1305_core_blocks); + +void poly1305_core_emit(const struct poly1305_state *state, const u32 nonce[4], + void *dst) +{ + u8 *mac = dst; + u64 h0, h1, h2, c; + u64 g0, g1, g2; + u64 t0, t1; + + /* fully carry h */ + h0 = state->h64[0]; + h1 = state->h64[1]; + h2 = state->h64[2]; + + c = h1 >> 44; + h1 &= 0xfffffffffffULL; + h2 += c; + c = h2 >> 42; + h2 &= 0x3ffffffffffULL; + h0 += c * 5; + c = h0 >> 44; + h0 &= 0xfffffffffffULL; + h1 += c; + c = h1 >> 44; + h1 &= 0xfffffffffffULL; + h2 += c; + c = h2 >> 42; + h2 &= 0x3ffffffffffULL; + h0 += c * 5; + c = h0 >> 44; + h0 &= 0xfffffffffffULL; + h1 += c; + + /* compute h + -p */ + g0 = h0 + 5; + c = g0 >> 44; + g0 &= 0xfffffffffffULL; + g1 = h1 + c; + c = g1 >> 44; + g1 &= 0xfffffffffffULL; + g2 = h2 + c - (1ULL << 42); + + /* select h if h < p, or h + -p if h >= p */ + c = (g2 >> ((sizeof(u64) * 8) - 1)) - 1; + g0 &= c; + g1 &= c; + g2 &= c; + c = ~c; + h0 = (h0 & c) | g0; + h1 = (h1 & c) | g1; + h2 = (h2 & c) | g2; + + if (likely(nonce)) { + /* h = (h + nonce) */ + t0 = ((u64)nonce[1] << 32) | nonce[0]; + t1 = ((u64)nonce[3] << 32) | nonce[2]; + + h0 += t0 & 0xfffffffffffULL; + c = h0 >> 44; + h0 &= 0xfffffffffffULL; + h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffffULL) + c; + c = h1 >> 44; + h1 &= 0xfffffffffffULL; + h2 += (((t1 >> 24)) & 0x3ffffffffffULL) + c; + h2 &= 0x3ffffffffffULL; + } + + /* mac = h % (2^128) */ + h0 = h0 | (h1 << 
44); + h1 = (h1 >> 20) | (h2 << 24); + + put_unaligned_le64(h0, &mac[0]); + put_unaligned_le64(h1, &mac[8]); +} +EXPORT_SYMBOL(poly1305_core_emit); diff --git a/lib/crypto/poly1305.c b/lib/crypto/poly1305.c index 32ec293c65ae..9d2d14df0fee 100644 --- a/lib/crypto/poly1305.c +++ b/lib/crypto/poly1305.c @@ -12,151 +12,9 @@ #include #include -static inline u64 mlt(u64 a, u64 b) -{ - return a * b; -} - -static inline u32 sr(u64 v, u_char n) -{ - return v >> n; -} - -static inline u32 and(u32 v, u32 mask) -{ - return v & mask; -} - -void poly1305_core_setkey(struct poly1305_key *key, const u8 *raw_key) -{ - /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ - key->r[0] = (get_unaligned_le32(raw_key + 0) >> 0) & 0x3ffffff; - key->r[1] = (get_unaligned_le32(raw_key + 3) >> 2) & 0x3ffff03; - key->r[2] = (get_unaligned_le32(raw_key + 6) >> 4) & 0x3ffc0ff; - key->r[3] = (get_unaligned_le32(raw_key + 9) >> 6) & 0x3f03fff; - key->r[4] = (get_unaligned_le32(raw_key + 12) >> 8) & 0x00fffff; -} -EXPORT_SYMBOL_GPL(poly1305_core_setkey); - -void poly1305_core_blocks(struct poly1305_state *state, - const struct poly1305_key *key, const void *src, - unsigned int nblocks, u32 hibit) -{ - u32 r0, r1, r2, r3, r4; - u32 s1, s2, s3, s4; - u32 h0, h1, h2, h3, h4; - u64 d0, d1, d2, d3, d4; - - if (!nblocks) - return; - - r0 = key->r[0]; - r1 = key->r[1]; - r2 = key->r[2]; - r3 = key->r[3]; - r4 = key->r[4]; - - s1 = r1 * 5; - s2 = r2 * 5; - s3 = r3 * 5; - s4 = r4 * 5; - - h0 = state->h[0]; - h1 = state->h[1]; - h2 = state->h[2]; - h3 = state->h[3]; - h4 = state->h[4]; - - do { - /* h += m[i] */ - h0 += (get_unaligned_le32(src + 0) >> 0) & 0x3ffffff; - h1 += (get_unaligned_le32(src + 3) >> 2) & 0x3ffffff; - h2 += (get_unaligned_le32(src + 6) >> 4) & 0x3ffffff; - h3 += (get_unaligned_le32(src + 9) >> 6) & 0x3ffffff; - h4 += (get_unaligned_le32(src + 12) >> 8) | (hibit << 24); - - /* h *= r */ - d0 = mlt(h0, r0) + mlt(h1, s4) + mlt(h2, s3) + - mlt(h3, s2) + mlt(h4, s1); - d1 = mlt(h0, r1) + mlt(h1, r0) + mlt(h2, s4) + - mlt(h3, s3) + mlt(h4, s2); - d2 = mlt(h0, r2) + mlt(h1, r1) + mlt(h2, r0) + - mlt(h3, s4) + mlt(h4, s3); - d3 = mlt(h0, r3) + mlt(h1, r2) + mlt(h2, r1) + - mlt(h3, r0) + mlt(h4, s4); - d4 = mlt(h0, r4) + mlt(h1, r3) + mlt(h2, r2) + - mlt(h3, r1) + mlt(h4, r0); - - /* (partial) h %= p */ - d1 += sr(d0, 26); h0 = and(d0, 0x3ffffff); - d2 += sr(d1, 26); h1 = and(d1, 0x3ffffff); - d3 += sr(d2, 26); h2 = and(d2, 0x3ffffff); - d4 += sr(d3, 26); h3 = and(d3, 0x3ffffff); - h0 += sr(d4, 26) * 5; h4 = and(d4, 0x3ffffff); - h1 += h0 >> 26; h0 = h0 & 0x3ffffff; - - src += POLY1305_BLOCK_SIZE; - } while (--nblocks); - - state->h[0] = h0; - state->h[1] = h1; - state->h[2] = h2; - state->h[3] = h3; - state->h[4] = h4; -} -EXPORT_SYMBOL_GPL(poly1305_core_blocks); - -void poly1305_core_emit(const struct poly1305_state *state, void *dst) -{ - u32 h0, h1, h2, h3, h4; - u32 g0, g1, g2, g3, g4; - u32 mask; - - /* fully carry h */ - h0 = state->h[0]; - h1 = state->h[1]; - h2 = state->h[2]; - h3 = state->h[3]; - h4 = state->h[4]; - - h2 += (h1 >> 26); h1 = h1 & 0x3ffffff; - h3 += (h2 >> 26); h2 = h2 & 0x3ffffff; - h4 += (h3 >> 26); h3 = h3 & 0x3ffffff; - h0 += (h4 >> 26) * 5; h4 = h4 & 0x3ffffff; - h1 += (h0 >> 26); h0 = h0 & 0x3ffffff; - - /* compute h + -p */ - g0 = h0 + 5; - g1 = h1 + (g0 >> 26); g0 &= 0x3ffffff; - g2 = h2 + (g1 >> 26); g1 &= 0x3ffffff; - g3 = h3 + (g2 >> 26); g2 &= 0x3ffffff; - g4 = h4 + (g3 >> 26) - (1 << 26); g3 &= 0x3ffffff; - - /* select h if h < p, or h + -p if h >= p */ - mask = (g4 >> ((sizeof(u32) * 8) 
- 1)) - 1; - g0 &= mask; - g1 &= mask; - g2 &= mask; - g3 &= mask; - g4 &= mask; - mask = ~mask; - h0 = (h0 & mask) | g0; - h1 = (h1 & mask) | g1; - h2 = (h2 & mask) | g2; - h3 = (h3 & mask) | g3; - h4 = (h4 & mask) | g4; - - /* h = h % (2^128) */ - put_unaligned_le32((h0 >> 0) | (h1 << 26), dst + 0); - put_unaligned_le32((h1 >> 6) | (h2 << 20), dst + 4); - put_unaligned_le32((h2 >> 12) | (h3 << 14), dst + 8); - put_unaligned_le32((h3 >> 18) | (h4 << 8), dst + 12); -} -EXPORT_SYMBOL_GPL(poly1305_core_emit); - void poly1305_init_generic(struct poly1305_desc_ctx *desc, const u8 *key) { - poly1305_core_setkey(desc->r, key); + poly1305_core_setkey(&desc->core_r, key); desc->s[0] = get_unaligned_le32(key + 16); desc->s[1] = get_unaligned_le32(key + 20); desc->s[2] = get_unaligned_le32(key + 24); @@ -164,7 +22,7 @@ void poly1305_init_generic(struct poly1305_desc_ctx *desc, const u8 *key) poly1305_core_init(&desc->h); desc->buflen = 0; desc->sset = true; - desc->rset = 1; + desc->rset = 2; } EXPORT_SYMBOL_GPL(poly1305_init_generic); @@ -181,13 +39,14 @@ void poly1305_update_generic(struct poly1305_desc_ctx *desc, const u8 *src, desc->buflen += bytes; if (desc->buflen == POLY1305_BLOCK_SIZE) { - poly1305_core_blocks(&desc->h, desc->r, desc->buf, 1, 1); + poly1305_core_blocks(&desc->h, &desc->core_r, desc->buf, + 1, 1); desc->buflen = 0; } } if (likely(nbytes >= POLY1305_BLOCK_SIZE)) { - poly1305_core_blocks(&desc->h, desc->r, src, + poly1305_core_blocks(&desc->h, &desc->core_r, src, nbytes / POLY1305_BLOCK_SIZE, 1); src += nbytes - (nbytes % POLY1305_BLOCK_SIZE); nbytes %= POLY1305_BLOCK_SIZE; @@ -202,28 +61,14 @@ EXPORT_SYMBOL_GPL(poly1305_update_generic); void poly1305_final_generic(struct poly1305_desc_ctx *desc, u8 *dst) { - __le32 digest[4]; - u64 f = 0; - if (unlikely(desc->buflen)) { desc->buf[desc->buflen++] = 1; memset(desc->buf + desc->buflen, 0, POLY1305_BLOCK_SIZE - desc->buflen); - poly1305_core_blocks(&desc->h, desc->r, desc->buf, 1, 0); + poly1305_core_blocks(&desc->h, &desc->core_r, desc->buf, 1, 0); } - poly1305_core_emit(&desc->h, digest); - - /* mac = (h + s) % (2^128) */ - f = (f >> 32) + le32_to_cpu(digest[0]) + desc->s[0]; - put_unaligned_le32(f, dst + 0); - f = (f >> 32) + le32_to_cpu(digest[1]) + desc->s[1]; - put_unaligned_le32(f, dst + 4); - f = (f >> 32) + le32_to_cpu(digest[2]) + desc->s[2]; - put_unaligned_le32(f, dst + 8); - f = (f >> 32) + le32_to_cpu(digest[3]) + desc->s[3]; - put_unaligned_le32(f, dst + 12); - + poly1305_core_emit(&desc->h, desc->s, dst); *desc = (struct poly1305_desc_ctx){}; } EXPORT_SYMBOL_GPL(poly1305_final_generic); -- cgit v1.2.3 From 0896ca2a0cb6127e8a129f1f2a680d49b6b0f65c Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sun, 5 Jan 2020 22:40:47 -0500 Subject: crypto: x86/poly1305 - import unmodified cryptogams implementation These x86_64 vectorized implementations come from Andy Polyakov's CRYPTOGAMS implementation, and are included here in raw form without modification, so that subsequent commits that fix these up for the kernel can see how it has changed. Signed-off-by: Jason A. 
Donenfeld Signed-off-by: Herbert Xu --- arch/x86/crypto/poly1305-x86_64-cryptogams.pl | 4159 +++++++++++++++++++++++++ 1 file changed, 4159 insertions(+) create mode 100644 arch/x86/crypto/poly1305-x86_64-cryptogams.pl (limited to 'arch/x86') diff --git a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl new file mode 100644 index 000000000000..342ad7f18aa7 --- /dev/null +++ b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl @@ -0,0 +1,4159 @@ +#! /usr/bin/env perl +# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# This module implements Poly1305 hash for x86_64. +# +# March 2015 +# +# Initial release. +# +# December 2016 +# +# Add AVX512F+VL+BW code path. +# +# November 2017 +# +# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be +# executed even on Knights Landing. Trigger for modification was +# observation that AVX512 code paths can negatively affect overall +# Skylake-X system performance. Since we are likely to suppress +# AVX512F capability flag [at least on Skylake-X], conversion serves +# as kind of "investment protection". Note that next *lake processor, +# Cannolake, has AVX512IFMA code path to execute... +# +# Numbers are cycles per processed byte with poly1305_blocks alone, +# measured with rdtsc at fixed clock frequency. +# +# IALU/gcc-4.8(*) AVX(**) AVX2 AVX-512 +# P4 4.46/+120% - +# Core 2 2.41/+90% - +# Westmere 1.88/+120% - +# Sandy Bridge 1.39/+140% 1.10 +# Haswell 1.14/+175% 1.11 0.65 +# Skylake[-X] 1.13/+120% 0.96 0.51 [0.35] +# Silvermont 2.83/+95% - +# Knights L 3.60/? 
1.65 1.10 0.41(***) +# Goldmont 1.70/+180% - +# VIA Nano 1.82/+150% - +# Sledgehammer 1.38/+160% - +# Bulldozer 2.30/+130% 0.97 +# Ryzen 1.15/+200% 1.08 1.18 +# +# (*) improvement coefficients relative to clang are more modest and +# are ~50% on most processors, in both cases we are comparing to +# __int128 code; +# (**) SSE2 implementation was attempted, but among non-AVX processors +# it was faster than integer-only code only on older Intel P4 and +# Core processors, 50-30%, less newer processor is, but slower on +# contemporary ones, for example almost 2x slower on Atom, and as +# former are naturally disappearing, SSE2 is deemed unnecessary; +# (***) strangely enough performance seems to vary from core to core, +# listed result is best case; + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26); +} + +if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { + $avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12); + $avx += 2 if ($1==2.11 && $2>=8); +} + +if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=12); +} + +if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { + $avx = ($2>=3.0) + ($2>3.0); +} + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx"); +my ($mac,$nonce)=($inp,$len); # *_emit arguments +my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13)); +my ($h0,$h1,$h2)=("%r14","%rbx","%rbp"); + +sub poly1305_iteration { +# input: copy of $r1 in %rax, $h0-$h2, $r0-$r1 +# output: $h0-$h2 *= $r0-$r1 +$code.=<<___; + mulq $h0 # h0*r1 + mov %rax,$d2 + mov $r0,%rax + mov %rdx,$d3 + + mulq $h0 # h0*r0 + mov %rax,$h0 # future $h0 + mov $r0,%rax + mov %rdx,$d1 + + mulq $h1 # h1*r0 + add %rax,$d2 + mov $s1,%rax + adc %rdx,$d3 + + mulq $h1 # h1*s1 + mov $h2,$h1 # borrow $h1 + add %rax,$h0 + adc %rdx,$d1 + + imulq $s1,$h1 # h2*s1 + add $h1,$d2 + mov $d1,$h1 + adc \$0,$d3 + + imulq $r0,$h2 # h2*r0 + add $d2,$h1 + mov \$-4,%rax # mask value + adc $h2,$d3 + + and $d3,%rax # last reduction step + mov $d3,$h2 + shr \$2,$d3 + and \$3,$h2 + add $d3,%rax + add %rax,$h0 + adc \$0,$h1 + adc \$0,$h2 +___ +} + +######################################################################## +# Layout of opaque area is following. 
+# +# unsigned __int64 h[3]; # current hash value base 2^64 +# unsigned __int64 r[2]; # key value base 2^64 + +$code.=<<___; +.text + +.extern OPENSSL_ia32cap_P + +.globl poly1305_init +.hidden poly1305_init +.globl poly1305_blocks +.hidden poly1305_blocks +.globl poly1305_emit +.hidden poly1305_emit + +.type poly1305_init,\@function,3 +.align 32 +poly1305_init: + xor %rax,%rax + mov %rax,0($ctx) # initialize hash value + mov %rax,8($ctx) + mov %rax,16($ctx) + + cmp \$0,$inp + je .Lno_key + + lea poly1305_blocks(%rip),%r10 + lea poly1305_emit(%rip),%r11 +___ +$code.=<<___ if ($avx); + mov OPENSSL_ia32cap_P+4(%rip),%r9 + lea poly1305_blocks_avx(%rip),%rax + lea poly1305_emit_avx(%rip),%rcx + bt \$`60-32`,%r9 # AVX? + cmovc %rax,%r10 + cmovc %rcx,%r11 +___ +$code.=<<___ if ($avx>1); + lea poly1305_blocks_avx2(%rip),%rax + bt \$`5+32`,%r9 # AVX2? + cmovc %rax,%r10 +___ +$code.=<<___ if ($avx>3); + mov \$`(1<<31|1<<21|1<<16)`,%rax + shr \$32,%r9 + and %rax,%r9 + cmp %rax,%r9 + je .Linit_base2_44 +___ +$code.=<<___; + mov \$0x0ffffffc0fffffff,%rax + mov \$0x0ffffffc0ffffffc,%rcx + and 0($inp),%rax + and 8($inp),%rcx + mov %rax,24($ctx) + mov %rcx,32($ctx) +___ +$code.=<<___ if ($flavour !~ /elf32/); + mov %r10,0(%rdx) + mov %r11,8(%rdx) +___ +$code.=<<___ if ($flavour =~ /elf32/); + mov %r10d,0(%rdx) + mov %r11d,4(%rdx) +___ +$code.=<<___; + mov \$1,%eax +.Lno_key: + ret +.size poly1305_init,.-poly1305_init + +.type poly1305_blocks,\@function,4 +.align 32 +poly1305_blocks: +.cfi_startproc +.Lblocks: + shr \$4,$len + jz .Lno_data # too short + + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lblocks_body: + + mov $len,%r15 # reassign $len + + mov 24($ctx),$r0 # load r + mov 32($ctx),$s1 + + mov 0($ctx),$h0 # load hash value + mov 8($ctx),$h1 + mov 16($ctx),$h2 + + mov $s1,$r1 + shr \$2,$s1 + mov $r1,%rax + add $r1,$s1 # s1 = r1 + (r1 >> 2) + jmp .Loop + +.align 32 +.Loop: + add 0($inp),$h0 # accumulate input + adc 8($inp),$h1 + lea 16($inp),$inp + adc $padbit,$h2 +___ + &poly1305_iteration(); +$code.=<<___; + mov $r1,%rax + dec %r15 # len-=16 + jnz .Loop + + mov $h0,0($ctx) # store hash value + mov $h1,8($ctx) + mov $h2,16($ctx) + + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbp +.cfi_restore %rbp + mov 40(%rsp),%rbx +.cfi_restore %rbx + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lno_data: +.Lblocks_epilogue: + ret +.cfi_endproc +.size poly1305_blocks,.-poly1305_blocks + +.type poly1305_emit,\@function,3 +.align 32 +poly1305_emit: +.Lemit: + mov 0($ctx),%r8 # load hash value + mov 8($ctx),%r9 + mov 16($ctx),%r10 + + mov %r8,%rax + add \$5,%r8 # compare to modulus + mov %r9,%rcx + adc \$0,%r9 + adc \$0,%r10 + shr \$2,%r10 # did 130-bit value overflow? + cmovnz %r8,%rax + cmovnz %r9,%rcx + + add 0($nonce),%rax # accumulate nonce + adc 8($nonce),%rcx + mov %rax,0($mac) # write result + mov %rcx,8($mac) + + ret +.size poly1305_emit,.-poly1305_emit +___ +if ($avx) { + +######################################################################## +# Layout of opaque area is following. 
+# +# unsigned __int32 h[5]; # current hash value base 2^26 +# unsigned __int32 is_base2_26; +# unsigned __int64 r[2]; # key value base 2^64 +# unsigned __int64 pad; +# struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9]; +# +# where r^n are base 2^26 digits of degrees of multiplier key. There are +# 5 digits, but last four are interleaved with multiples of 5, totalling +# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4. + +my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) = + map("%xmm$_",(0..15)); + +$code.=<<___; +.type __poly1305_block,\@abi-omnipotent +.align 32 +__poly1305_block: +___ + &poly1305_iteration(); +$code.=<<___; + ret +.size __poly1305_block,.-__poly1305_block + +.type __poly1305_init_avx,\@abi-omnipotent +.align 32 +__poly1305_init_avx: + mov $r0,$h0 + mov $r1,$h1 + xor $h2,$h2 + + lea 48+64($ctx),$ctx # size optimization + + mov $r1,%rax + call __poly1305_block # r^2 + + mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26 + mov \$0x3ffffff,%edx + mov $h0,$d1 + and $h0#d,%eax + mov $r0,$d2 + and $r0#d,%edx + mov %eax,`16*0+0-64`($ctx) + shr \$26,$d1 + mov %edx,`16*0+4-64`($ctx) + shr \$26,$d2 + + mov \$0x3ffffff,%eax + mov \$0x3ffffff,%edx + and $d1#d,%eax + and $d2#d,%edx + mov %eax,`16*1+0-64`($ctx) + lea (%rax,%rax,4),%eax # *5 + mov %edx,`16*1+4-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + mov %eax,`16*2+0-64`($ctx) + shr \$26,$d1 + mov %edx,`16*2+4-64`($ctx) + shr \$26,$d2 + + mov $h1,%rax + mov $r1,%rdx + shl \$12,%rax + shl \$12,%rdx + or $d1,%rax + or $d2,%rdx + and \$0x3ffffff,%eax + and \$0x3ffffff,%edx + mov %eax,`16*3+0-64`($ctx) + lea (%rax,%rax,4),%eax # *5 + mov %edx,`16*3+4-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + mov %eax,`16*4+0-64`($ctx) + mov $h1,$d1 + mov %edx,`16*4+4-64`($ctx) + mov $r1,$d2 + + mov \$0x3ffffff,%eax + mov \$0x3ffffff,%edx + shr \$14,$d1 + shr \$14,$d2 + and $d1#d,%eax + and $d2#d,%edx + mov %eax,`16*5+0-64`($ctx) + lea (%rax,%rax,4),%eax # *5 + mov %edx,`16*5+4-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + mov %eax,`16*6+0-64`($ctx) + shr \$26,$d1 + mov %edx,`16*6+4-64`($ctx) + shr \$26,$d2 + + mov $h2,%rax + shl \$24,%rax + or %rax,$d1 + mov $d1#d,`16*7+0-64`($ctx) + lea ($d1,$d1,4),$d1 # *5 + mov $d2#d,`16*7+4-64`($ctx) + lea ($d2,$d2,4),$d2 # *5 + mov $d1#d,`16*8+0-64`($ctx) + mov $d2#d,`16*8+4-64`($ctx) + + mov $r1,%rax + call __poly1305_block # r^3 + + mov \$0x3ffffff,%eax # save r^3 base 2^26 + mov $h0,$d1 + and $h0#d,%eax + shr \$26,$d1 + mov %eax,`16*0+12-64`($ctx) + + mov \$0x3ffffff,%edx + and $d1#d,%edx + mov %edx,`16*1+12-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + shr \$26,$d1 + mov %edx,`16*2+12-64`($ctx) + + mov $h1,%rax + shl \$12,%rax + or $d1,%rax + and \$0x3ffffff,%eax + mov %eax,`16*3+12-64`($ctx) + lea (%rax,%rax,4),%eax # *5 + mov $h1,$d1 + mov %eax,`16*4+12-64`($ctx) + + mov \$0x3ffffff,%edx + shr \$14,$d1 + and $d1#d,%edx + mov %edx,`16*5+12-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + shr \$26,$d1 + mov %edx,`16*6+12-64`($ctx) + + mov $h2,%rax + shl \$24,%rax + or %rax,$d1 + mov $d1#d,`16*7+12-64`($ctx) + lea ($d1,$d1,4),$d1 # *5 + mov $d1#d,`16*8+12-64`($ctx) + + mov $r1,%rax + call __poly1305_block # r^4 + + mov \$0x3ffffff,%eax # save r^4 base 2^26 + mov $h0,$d1 + and $h0#d,%eax + shr \$26,$d1 + mov %eax,`16*0+8-64`($ctx) + + mov \$0x3ffffff,%edx + and $d1#d,%edx + mov %edx,`16*1+8-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + shr \$26,$d1 + mov %edx,`16*2+8-64`($ctx) + + mov $h1,%rax + shl \$12,%rax + or $d1,%rax + and \$0x3ffffff,%eax + mov %eax,`16*3+8-64`($ctx) + lea 
(%rax,%rax,4),%eax # *5 + mov $h1,$d1 + mov %eax,`16*4+8-64`($ctx) + + mov \$0x3ffffff,%edx + shr \$14,$d1 + and $d1#d,%edx + mov %edx,`16*5+8-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + shr \$26,$d1 + mov %edx,`16*6+8-64`($ctx) + + mov $h2,%rax + shl \$24,%rax + or %rax,$d1 + mov $d1#d,`16*7+8-64`($ctx) + lea ($d1,$d1,4),$d1 # *5 + mov $d1#d,`16*8+8-64`($ctx) + + lea -48-64($ctx),$ctx # size [de-]optimization + ret +.size __poly1305_init_avx,.-__poly1305_init_avx + +.type poly1305_blocks_avx,\@function,4 +.align 32 +poly1305_blocks_avx: +.cfi_startproc + mov 20($ctx),%r8d # is_base2_26 + cmp \$128,$len + jae .Lblocks_avx + test %r8d,%r8d + jz .Lblocks + +.Lblocks_avx: + and \$-16,$len + jz .Lno_data_avx + + vzeroupper + + test %r8d,%r8d + jz .Lbase2_64_avx + + test \$31,$len + jz .Leven_avx + + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lblocks_avx_body: + + mov $len,%r15 # reassign $len + + mov 0($ctx),$d1 # load hash value + mov 8($ctx),$d2 + mov 16($ctx),$h2#d + + mov 24($ctx),$r0 # load r + mov 32($ctx),$s1 + + ################################# base 2^26 -> base 2^64 + mov $d1#d,$h0#d + and \$`-1*(1<<31)`,$d1 + mov $d2,$r1 # borrow $r1 + mov $d2#d,$h1#d + and \$`-1*(1<<31)`,$d2 + + shr \$6,$d1 + shl \$52,$r1 + add $d1,$h0 + shr \$12,$h1 + shr \$18,$d2 + add $r1,$h0 + adc $d2,$h1 + + mov $h2,$d1 + shl \$40,$d1 + shr \$24,$h2 + add $d1,$h1 + adc \$0,$h2 # can be partially reduced... + + mov \$-4,$d2 # ... so reduce + mov $h2,$d1 + and $h2,$d2 + shr \$2,$d1 + and \$3,$h2 + add $d2,$d1 # =*5 + add $d1,$h0 + adc \$0,$h1 + adc \$0,$h2 + + mov $s1,$r1 + mov $s1,%rax + shr \$2,$s1 + add $r1,$s1 # s1 = r1 + (r1 >> 2) + + add 0($inp),$h0 # accumulate input + adc 8($inp),$h1 + lea 16($inp),$inp + adc $padbit,$h2 + + call __poly1305_block + + test $padbit,$padbit # if $padbit is zero, + jz .Lstore_base2_64_avx # store hash in base 2^64 format + + ################################# base 2^64 -> base 2^26 + mov $h0,%rax + mov $h0,%rdx + shr \$52,$h0 + mov $h1,$r0 + mov $h1,$r1 + shr \$26,%rdx + and \$0x3ffffff,%rax # h[0] + shl \$12,$r0 + and \$0x3ffffff,%rdx # h[1] + shr \$14,$h1 + or $r0,$h0 + shl \$24,$h2 + and \$0x3ffffff,$h0 # h[2] + shr \$40,$r1 + and \$0x3ffffff,$h1 # h[3] + or $r1,$h2 # h[4] + + sub \$16,%r15 + jz .Lstore_base2_26_avx + + vmovd %rax#d,$H0 + vmovd %rdx#d,$H1 + vmovd $h0#d,$H2 + vmovd $h1#d,$H3 + vmovd $h2#d,$H4 + jmp .Lproceed_avx + +.align 32 +.Lstore_base2_64_avx: + mov $h0,0($ctx) + mov $h1,8($ctx) + mov $h2,16($ctx) # note that is_base2_26 is zeroed + jmp .Ldone_avx + +.align 16 +.Lstore_base2_26_avx: + mov %rax#d,0($ctx) # store hash value base 2^26 + mov %rdx#d,4($ctx) + mov $h0#d,8($ctx) + mov $h1#d,12($ctx) + mov $h2#d,16($ctx) +.align 16 +.Ldone_avx: + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbp +.cfi_restore %rbp + mov 40(%rsp),%rbx +.cfi_restore %rbx + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lno_data_avx: +.Lblocks_avx_epilogue: + ret +.cfi_endproc + +.align 32 +.Lbase2_64_avx: +.cfi_startproc + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lbase2_64_avx_body: + + mov $len,%r15 # reassign $len + + mov 24($ctx),$r0 # load r + mov 32($ctx),$s1 + + mov 0($ctx),$h0 # load hash value + mov 
8($ctx),$h1 + mov 16($ctx),$h2#d + + mov $s1,$r1 + mov $s1,%rax + shr \$2,$s1 + add $r1,$s1 # s1 = r1 + (r1 >> 2) + + test \$31,$len + jz .Linit_avx + + add 0($inp),$h0 # accumulate input + adc 8($inp),$h1 + lea 16($inp),$inp + adc $padbit,$h2 + sub \$16,%r15 + + call __poly1305_block + +.Linit_avx: + ################################# base 2^64 -> base 2^26 + mov $h0,%rax + mov $h0,%rdx + shr \$52,$h0 + mov $h1,$d1 + mov $h1,$d2 + shr \$26,%rdx + and \$0x3ffffff,%rax # h[0] + shl \$12,$d1 + and \$0x3ffffff,%rdx # h[1] + shr \$14,$h1 + or $d1,$h0 + shl \$24,$h2 + and \$0x3ffffff,$h0 # h[2] + shr \$40,$d2 + and \$0x3ffffff,$h1 # h[3] + or $d2,$h2 # h[4] + + vmovd %rax#d,$H0 + vmovd %rdx#d,$H1 + vmovd $h0#d,$H2 + vmovd $h1#d,$H3 + vmovd $h2#d,$H4 + movl \$1,20($ctx) # set is_base2_26 + + call __poly1305_init_avx + +.Lproceed_avx: + mov %r15,$len + + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbp +.cfi_restore %rbp + mov 40(%rsp),%rbx +.cfi_restore %rbx + lea 48(%rsp),%rax + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lbase2_64_avx_epilogue: + jmp .Ldo_avx +.cfi_endproc + +.align 32 +.Leven_avx: +.cfi_startproc + vmovd 4*0($ctx),$H0 # load hash value + vmovd 4*1($ctx),$H1 + vmovd 4*2($ctx),$H2 + vmovd 4*3($ctx),$H3 + vmovd 4*4($ctx),$H4 + +.Ldo_avx: +___ +$code.=<<___ if (!$win64); + lea -0x58(%rsp),%r11 +.cfi_def_cfa %r11,0x60 + sub \$0x178,%rsp +___ +$code.=<<___ if ($win64); + lea -0xf8(%rsp),%r11 + sub \$0x218,%rsp + vmovdqa %xmm6,0x50(%r11) + vmovdqa %xmm7,0x60(%r11) + vmovdqa %xmm8,0x70(%r11) + vmovdqa %xmm9,0x80(%r11) + vmovdqa %xmm10,0x90(%r11) + vmovdqa %xmm11,0xa0(%r11) + vmovdqa %xmm12,0xb0(%r11) + vmovdqa %xmm13,0xc0(%r11) + vmovdqa %xmm14,0xd0(%r11) + vmovdqa %xmm15,0xe0(%r11) +.Ldo_avx_body: +___ +$code.=<<___; + sub \$64,$len + lea -32($inp),%rax + cmovc %rax,$inp + + vmovdqu `16*3`($ctx),$D4 # preload r0^2 + lea `16*3+64`($ctx),$ctx # size optimization + lea .Lconst(%rip),%rcx + + ################################################################ + # load input + vmovdqu 16*2($inp),$T0 + vmovdqu 16*3($inp),$T1 + vmovdqa 64(%rcx),$MASK # .Lmask26 + + vpsrldq \$6,$T0,$T2 # splat input + vpsrldq \$6,$T1,$T3 + vpunpckhqdq $T1,$T0,$T4 # 4 + vpunpcklqdq $T1,$T0,$T0 # 0:1 + vpunpcklqdq $T3,$T2,$T3 # 2:3 + + vpsrlq \$40,$T4,$T4 # 4 + vpsrlq \$26,$T0,$T1 + vpand $MASK,$T0,$T0 # 0 + vpsrlq \$4,$T3,$T2 + vpand $MASK,$T1,$T1 # 1 + vpsrlq \$30,$T3,$T3 + vpand $MASK,$T2,$T2 # 2 + vpand $MASK,$T3,$T3 # 3 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always + + jbe .Lskip_loop_avx + + # expand and copy pre-calculated table to stack + vmovdqu `16*1-64`($ctx),$D1 + vmovdqu `16*2-64`($ctx),$D2 + vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434 + vpshufd \$0x44,$D4,$D0 # xx12 -> 1212 + vmovdqa $D3,-0x90(%r11) + vmovdqa $D0,0x00(%rsp) + vpshufd \$0xEE,$D1,$D4 + vmovdqu `16*3-64`($ctx),$D0 + vpshufd \$0x44,$D1,$D1 + vmovdqa $D4,-0x80(%r11) + vmovdqa $D1,0x10(%rsp) + vpshufd \$0xEE,$D2,$D3 + vmovdqu `16*4-64`($ctx),$D1 + vpshufd \$0x44,$D2,$D2 + vmovdqa $D3,-0x70(%r11) + vmovdqa $D2,0x20(%rsp) + vpshufd \$0xEE,$D0,$D4 + vmovdqu `16*5-64`($ctx),$D2 + vpshufd \$0x44,$D0,$D0 + vmovdqa $D4,-0x60(%r11) + vmovdqa $D0,0x30(%rsp) + vpshufd \$0xEE,$D1,$D3 + vmovdqu `16*6-64`($ctx),$D0 + vpshufd \$0x44,$D1,$D1 + vmovdqa $D3,-0x50(%r11) + vmovdqa $D1,0x40(%rsp) + vpshufd \$0xEE,$D2,$D4 + vmovdqu `16*7-64`($ctx),$D1 + vpshufd \$0x44,$D2,$D2 + vmovdqa $D4,-0x40(%r11) + vmovdqa $D2,0x50(%rsp) + vpshufd 
\$0xEE,$D0,$D3 + vmovdqu `16*8-64`($ctx),$D2 + vpshufd \$0x44,$D0,$D0 + vmovdqa $D3,-0x30(%r11) + vmovdqa $D0,0x60(%rsp) + vpshufd \$0xEE,$D1,$D4 + vpshufd \$0x44,$D1,$D1 + vmovdqa $D4,-0x20(%r11) + vmovdqa $D1,0x70(%rsp) + vpshufd \$0xEE,$D2,$D3 + vmovdqa 0x00(%rsp),$D4 # preload r0^2 + vpshufd \$0x44,$D2,$D2 + vmovdqa $D3,-0x10(%r11) + vmovdqa $D2,0x80(%rsp) + + jmp .Loop_avx + +.align 32 +.Loop_avx: + ################################################################ + # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 + # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r + # \___________________/ + # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 + # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r + # \___________________/ \____________________/ + # + # Note that we start with inp[2:3]*r^2. This is because it + # doesn't depend on reduction in previous iteration. + ################################################################ + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + # + # though note that $Tx and $Hx are "reversed" in this section, + # and $D4 is preloaded with r0^2... + + vpmuludq $T0,$D4,$D0 # d0 = h0*r0 + vpmuludq $T1,$D4,$D1 # d1 = h1*r0 + vmovdqa $H2,0x20(%r11) # offload hash + vpmuludq $T2,$D4,$D2 # d3 = h2*r0 + vmovdqa 0x10(%rsp),$H2 # r1^2 + vpmuludq $T3,$D4,$D3 # d3 = h3*r0 + vpmuludq $T4,$D4,$D4 # d4 = h4*r0 + + vmovdqa $H0,0x00(%r11) # + vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1 + vmovdqa $H1,0x10(%r11) # + vpmuludq $T3,$H2,$H1 # h3*r1 + vpaddq $H0,$D0,$D0 # d0 += h4*s1 + vpaddq $H1,$D4,$D4 # d4 += h3*r1 + vmovdqa $H3,0x30(%r11) # + vpmuludq $T2,$H2,$H0 # h2*r1 + vpmuludq $T1,$H2,$H1 # h1*r1 + vpaddq $H0,$D3,$D3 # d3 += h2*r1 + vmovdqa 0x30(%rsp),$H3 # r2^2 + vpaddq $H1,$D2,$D2 # d2 += h1*r1 + vmovdqa $H4,0x40(%r11) # + vpmuludq $T0,$H2,$H2 # h0*r1 + vpmuludq $T2,$H3,$H0 # h2*r2 + vpaddq $H2,$D1,$D1 # d1 += h0*r1 + + vmovdqa 0x40(%rsp),$H4 # s2^2 + vpaddq $H0,$D4,$D4 # d4 += h2*r2 + vpmuludq $T1,$H3,$H1 # h1*r2 + vpmuludq $T0,$H3,$H3 # h0*r2 + vpaddq $H1,$D3,$D3 # d3 += h1*r2 + vmovdqa 0x50(%rsp),$H2 # r3^2 + vpaddq $H3,$D2,$D2 # d2 += h0*r2 + vpmuludq $T4,$H4,$H0 # h4*s2 + vpmuludq $T3,$H4,$H4 # h3*s2 + vpaddq $H0,$D1,$D1 # d1 += h4*s2 + vmovdqa 0x60(%rsp),$H3 # s3^2 + vpaddq $H4,$D0,$D0 # d0 += h3*s2 + + vmovdqa 0x80(%rsp),$H4 # s4^2 + vpmuludq $T1,$H2,$H1 # h1*r3 + vpmuludq $T0,$H2,$H2 # h0*r3 + vpaddq $H1,$D4,$D4 # d4 += h1*r3 + vpaddq $H2,$D3,$D3 # d3 += h0*r3 + vpmuludq $T4,$H3,$H0 # h4*s3 + vpmuludq $T3,$H3,$H1 # h3*s3 + vpaddq $H0,$D2,$D2 # d2 += h4*s3 + vmovdqu 16*0($inp),$H0 # load input + vpaddq $H1,$D1,$D1 # d1 += h3*s3 + vpmuludq $T2,$H3,$H3 # h2*s3 + vpmuludq $T2,$H4,$T2 # h2*s4 + vpaddq $H3,$D0,$D0 # d0 += h2*s3 + + vmovdqu 16*1($inp),$H1 # + vpaddq $T2,$D1,$D1 # d1 += h2*s4 + vpmuludq $T3,$H4,$T3 # h3*s4 + vpmuludq $T4,$H4,$T4 # h4*s4 + vpsrldq \$6,$H0,$H2 # splat input + vpaddq $T3,$D2,$D2 # d2 += h3*s4 + vpaddq $T4,$D3,$D3 # d3 += h4*s4 + vpsrldq \$6,$H1,$H3 # + vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4 + vpmuludq $T1,$H4,$T0 # h1*s4 + vpunpckhqdq $H1,$H0,$H4 # 4 + vpaddq $T4,$D4,$D4 # d4 += h0*r4 + vmovdqa -0x90(%r11),$T4 # r0^4 + vpaddq $T0,$D0,$D0 # d0 += h1*s4 + + vpunpcklqdq $H1,$H0,$H0 # 0:1 + vpunpcklqdq $H3,$H2,$H3 # 2:3 + + #vpsrlq \$40,$H4,$H4 # 4 + vpsrldq \$`40/8`,$H4,$H4 # 4 + vpsrlq \$26,$H0,$H1 + vpand $MASK,$H0,$H0 # 0 + 
vpsrlq \$4,$H3,$H2 + vpand $MASK,$H1,$H1 # 1 + vpand 0(%rcx),$H4,$H4 # .Lmask24 + vpsrlq \$30,$H3,$H3 + vpand $MASK,$H2,$H2 # 2 + vpand $MASK,$H3,$H3 # 3 + vpor 32(%rcx),$H4,$H4 # padbit, yes, always + + vpaddq 0x00(%r11),$H0,$H0 # add hash value + vpaddq 0x10(%r11),$H1,$H1 + vpaddq 0x20(%r11),$H2,$H2 + vpaddq 0x30(%r11),$H3,$H3 + vpaddq 0x40(%r11),$H4,$H4 + + lea 16*2($inp),%rax + lea 16*4($inp),$inp + sub \$64,$len + cmovc %rax,$inp + + ################################################################ + # Now we accumulate (inp[0:1]+hash)*r^4 + ################################################################ + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + + vpmuludq $H0,$T4,$T0 # h0*r0 + vpmuludq $H1,$T4,$T1 # h1*r0 + vpaddq $T0,$D0,$D0 + vpaddq $T1,$D1,$D1 + vmovdqa -0x80(%r11),$T2 # r1^4 + vpmuludq $H2,$T4,$T0 # h2*r0 + vpmuludq $H3,$T4,$T1 # h3*r0 + vpaddq $T0,$D2,$D2 + vpaddq $T1,$D3,$D3 + vpmuludq $H4,$T4,$T4 # h4*r0 + vpmuludq -0x70(%r11),$H4,$T0 # h4*s1 + vpaddq $T4,$D4,$D4 + + vpaddq $T0,$D0,$D0 # d0 += h4*s1 + vpmuludq $H2,$T2,$T1 # h2*r1 + vpmuludq $H3,$T2,$T0 # h3*r1 + vpaddq $T1,$D3,$D3 # d3 += h2*r1 + vmovdqa -0x60(%r11),$T3 # r2^4 + vpaddq $T0,$D4,$D4 # d4 += h3*r1 + vpmuludq $H1,$T2,$T1 # h1*r1 + vpmuludq $H0,$T2,$T2 # h0*r1 + vpaddq $T1,$D2,$D2 # d2 += h1*r1 + vpaddq $T2,$D1,$D1 # d1 += h0*r1 + + vmovdqa -0x50(%r11),$T4 # s2^4 + vpmuludq $H2,$T3,$T0 # h2*r2 + vpmuludq $H1,$T3,$T1 # h1*r2 + vpaddq $T0,$D4,$D4 # d4 += h2*r2 + vpaddq $T1,$D3,$D3 # d3 += h1*r2 + vmovdqa -0x40(%r11),$T2 # r3^4 + vpmuludq $H0,$T3,$T3 # h0*r2 + vpmuludq $H4,$T4,$T0 # h4*s2 + vpaddq $T3,$D2,$D2 # d2 += h0*r2 + vpaddq $T0,$D1,$D1 # d1 += h4*s2 + vmovdqa -0x30(%r11),$T3 # s3^4 + vpmuludq $H3,$T4,$T4 # h3*s2 + vpmuludq $H1,$T2,$T1 # h1*r3 + vpaddq $T4,$D0,$D0 # d0 += h3*s2 + + vmovdqa -0x10(%r11),$T4 # s4^4 + vpaddq $T1,$D4,$D4 # d4 += h1*r3 + vpmuludq $H0,$T2,$T2 # h0*r3 + vpmuludq $H4,$T3,$T0 # h4*s3 + vpaddq $T2,$D3,$D3 # d3 += h0*r3 + vpaddq $T0,$D2,$D2 # d2 += h4*s3 + vmovdqu 16*2($inp),$T0 # load input + vpmuludq $H3,$T3,$T2 # h3*s3 + vpmuludq $H2,$T3,$T3 # h2*s3 + vpaddq $T2,$D1,$D1 # d1 += h3*s3 + vmovdqu 16*3($inp),$T1 # + vpaddq $T3,$D0,$D0 # d0 += h2*s3 + + vpmuludq $H2,$T4,$H2 # h2*s4 + vpmuludq $H3,$T4,$H3 # h3*s4 + vpsrldq \$6,$T0,$T2 # splat input + vpaddq $H2,$D1,$D1 # d1 += h2*s4 + vpmuludq $H4,$T4,$H4 # h4*s4 + vpsrldq \$6,$T1,$T3 # + vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4 + vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4 + vpmuludq -0x20(%r11),$H0,$H4 # h0*r4 + vpmuludq $H1,$T4,$H0 + vpunpckhqdq $T1,$T0,$T4 # 4 + vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 + vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 + + vpunpcklqdq $T1,$T0,$T0 # 0:1 + vpunpcklqdq $T3,$T2,$T3 # 2:3 + + #vpsrlq \$40,$T4,$T4 # 4 + vpsrldq \$`40/8`,$T4,$T4 # 4 + vpsrlq \$26,$T0,$T1 + vmovdqa 0x00(%rsp),$D4 # preload r0^2 + vpand $MASK,$T0,$T0 # 0 + vpsrlq \$4,$T3,$T2 + vpand $MASK,$T1,$T1 # 1 + vpand 0(%rcx),$T4,$T4 # .Lmask24 + vpsrlq \$30,$T3,$T3 + vpand $MASK,$T2,$T2 # 2 + vpand $MASK,$T3,$T3 # 3 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always + + ################################################################ + # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein + # and P. 
Schwabe + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$D1,$H1 # h0 -> h1 + + vpsrlq \$26,$H4,$D0 + vpand $MASK,$H4,$H4 + + vpsrlq \$26,$H1,$D1 + vpand $MASK,$H1,$H1 + vpaddq $D1,$H2,$H2 # h1 -> h2 + + vpaddq $D0,$H0,$H0 + vpsllq \$2,$D0,$D0 + vpaddq $D0,$H0,$H0 # h4 -> h0 + + vpsrlq \$26,$H2,$D2 + vpand $MASK,$H2,$H2 + vpaddq $D2,$H3,$H3 # h2 -> h3 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + ja .Loop_avx + +.Lskip_loop_avx: + ################################################################ + # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 + + vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2 + add \$32,$len + jnz .Long_tail_avx + + vpaddq $H2,$T2,$T2 + vpaddq $H0,$T0,$T0 + vpaddq $H1,$T1,$T1 + vpaddq $H3,$T3,$T3 + vpaddq $H4,$T4,$T4 + +.Long_tail_avx: + vmovdqa $H2,0x20(%r11) + vmovdqa $H0,0x00(%r11) + vmovdqa $H1,0x10(%r11) + vmovdqa $H3,0x30(%r11) + vmovdqa $H4,0x40(%r11) + + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + + vpmuludq $T2,$D4,$D2 # d2 = h2*r0 + vpmuludq $T0,$D4,$D0 # d0 = h0*r0 + vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n + vpmuludq $T1,$D4,$D1 # d1 = h1*r0 + vpmuludq $T3,$D4,$D3 # d3 = h3*r0 + vpmuludq $T4,$D4,$D4 # d4 = h4*r0 + + vpmuludq $T3,$H2,$H0 # h3*r1 + vpaddq $H0,$D4,$D4 # d4 += h3*r1 + vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n + vpmuludq $T2,$H2,$H1 # h2*r1 + vpaddq $H1,$D3,$D3 # d3 += h2*r1 + vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n + vpmuludq $T1,$H2,$H0 # h1*r1 + vpaddq $H0,$D2,$D2 # d2 += h1*r1 + vpmuludq $T0,$H2,$H2 # h0*r1 + vpaddq $H2,$D1,$D1 # d1 += h0*r1 + vpmuludq $T4,$H3,$H3 # h4*s1 + vpaddq $H3,$D0,$D0 # d0 += h4*s1 + + vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n + vpmuludq $T2,$H4,$H1 # h2*r2 + vpaddq $H1,$D4,$D4 # d4 += h2*r2 + vpmuludq $T1,$H4,$H0 # h1*r2 + vpaddq $H0,$D3,$D3 # d3 += h1*r2 + vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n + vpmuludq $T0,$H4,$H4 # h0*r2 + vpaddq $H4,$D2,$D2 # d2 += h0*r2 + vpmuludq $T4,$H2,$H1 # h4*s2 + vpaddq $H1,$D1,$D1 # d1 += h4*s2 + vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n + vpmuludq $T3,$H2,$H2 # h3*s2 + vpaddq $H2,$D0,$D0 # d0 += h3*s2 + + vpmuludq $T1,$H3,$H0 # h1*r3 + vpaddq $H0,$D4,$D4 # d4 += h1*r3 + vpmuludq $T0,$H3,$H3 # h0*r3 + vpaddq $H3,$D3,$D3 # d3 += h0*r3 + vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n + vpmuludq $T4,$H4,$H1 # h4*s3 + vpaddq $H1,$D2,$D2 # d2 += h4*s3 + vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n + vpmuludq $T3,$H4,$H0 # h3*s3 + vpaddq $H0,$D1,$D1 # d1 += h3*s3 + vpmuludq $T2,$H4,$H4 # h2*s3 + vpaddq $H4,$D0,$D0 # d0 += h2*s3 + + vpmuludq $T0,$H2,$H2 # h0*r4 + vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4 + vpmuludq $T4,$H3,$H1 # h4*s4 + vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4 + vpmuludq $T3,$H3,$H0 # h3*s4 + vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4 + vpmuludq $T2,$H3,$H1 # h2*s4 + vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4 + vpmuludq $T1,$H3,$H3 # h1*s4 + vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4 + + jz .Lshort_tail_avx + + vmovdqu 16*0($inp),$H0 # load input + vmovdqu 16*1($inp),$H1 + + vpsrldq \$6,$H0,$H2 # splat input + vpsrldq \$6,$H1,$H3 + vpunpckhqdq $H1,$H0,$H4 # 4 + vpunpcklqdq $H1,$H0,$H0 # 0:1 + vpunpcklqdq $H3,$H2,$H3 # 2:3 + + vpsrlq \$40,$H4,$H4 # 4 + vpsrlq \$26,$H0,$H1 + vpand $MASK,$H0,$H0 # 0 + 
vpsrlq \$4,$H3,$H2 + vpand $MASK,$H1,$H1 # 1 + vpsrlq \$30,$H3,$H3 + vpand $MASK,$H2,$H2 # 2 + vpand $MASK,$H3,$H3 # 3 + vpor 32(%rcx),$H4,$H4 # padbit, yes, always + + vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4 + vpaddq 0x00(%r11),$H0,$H0 + vpaddq 0x10(%r11),$H1,$H1 + vpaddq 0x20(%r11),$H2,$H2 + vpaddq 0x30(%r11),$H3,$H3 + vpaddq 0x40(%r11),$H4,$H4 + + ################################################################ + # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate + + vpmuludq $H0,$T4,$T0 # h0*r0 + vpaddq $T0,$D0,$D0 # d0 += h0*r0 + vpmuludq $H1,$T4,$T1 # h1*r0 + vpaddq $T1,$D1,$D1 # d1 += h1*r0 + vpmuludq $H2,$T4,$T0 # h2*r0 + vpaddq $T0,$D2,$D2 # d2 += h2*r0 + vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n + vpmuludq $H3,$T4,$T1 # h3*r0 + vpaddq $T1,$D3,$D3 # d3 += h3*r0 + vpmuludq $H4,$T4,$T4 # h4*r0 + vpaddq $T4,$D4,$D4 # d4 += h4*r0 + + vpmuludq $H3,$T2,$T0 # h3*r1 + vpaddq $T0,$D4,$D4 # d4 += h3*r1 + vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1 + vpmuludq $H2,$T2,$T1 # h2*r1 + vpaddq $T1,$D3,$D3 # d3 += h2*r1 + vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2 + vpmuludq $H1,$T2,$T0 # h1*r1 + vpaddq $T0,$D2,$D2 # d2 += h1*r1 + vpmuludq $H0,$T2,$T2 # h0*r1 + vpaddq $T2,$D1,$D1 # d1 += h0*r1 + vpmuludq $H4,$T3,$T3 # h4*s1 + vpaddq $T3,$D0,$D0 # d0 += h4*s1 + + vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2 + vpmuludq $H2,$T4,$T1 # h2*r2 + vpaddq $T1,$D4,$D4 # d4 += h2*r2 + vpmuludq $H1,$T4,$T0 # h1*r2 + vpaddq $T0,$D3,$D3 # d3 += h1*r2 + vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3 + vpmuludq $H0,$T4,$T4 # h0*r2 + vpaddq $T4,$D2,$D2 # d2 += h0*r2 + vpmuludq $H4,$T2,$T1 # h4*s2 + vpaddq $T1,$D1,$D1 # d1 += h4*s2 + vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3 + vpmuludq $H3,$T2,$T2 # h3*s2 + vpaddq $T2,$D0,$D0 # d0 += h3*s2 + + vpmuludq $H1,$T3,$T0 # h1*r3 + vpaddq $T0,$D4,$D4 # d4 += h1*r3 + vpmuludq $H0,$T3,$T3 # h0*r3 + vpaddq $T3,$D3,$D3 # d3 += h0*r3 + vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4 + vpmuludq $H4,$T4,$T1 # h4*s3 + vpaddq $T1,$D2,$D2 # d2 += h4*s3 + vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4 + vpmuludq $H3,$T4,$T0 # h3*s3 + vpaddq $T0,$D1,$D1 # d1 += h3*s3 + vpmuludq $H2,$T4,$T4 # h2*s3 + vpaddq $T4,$D0,$D0 # d0 += h2*s3 + + vpmuludq $H0,$T2,$T2 # h0*r4 + vpaddq $T2,$D4,$D4 # d4 += h0*r4 + vpmuludq $H4,$T3,$T1 # h4*s4 + vpaddq $T1,$D3,$D3 # d3 += h4*s4 + vpmuludq $H3,$T3,$T0 # h3*s4 + vpaddq $T0,$D2,$D2 # d2 += h3*s4 + vpmuludq $H2,$T3,$T1 # h2*s4 + vpaddq $T1,$D1,$D1 # d1 += h2*s4 + vpmuludq $H1,$T3,$T3 # h1*s4 + vpaddq $T3,$D0,$D0 # d0 += h1*s4 + +.Lshort_tail_avx: + ################################################################ + # horizontal addition + + vpsrldq \$8,$D4,$T4 + vpsrldq \$8,$D3,$T3 + vpsrldq \$8,$D1,$T1 + vpsrldq \$8,$D0,$T0 + vpsrldq \$8,$D2,$T2 + vpaddq $T3,$D3,$D3 + vpaddq $T4,$D4,$D4 + vpaddq $T0,$D0,$D0 + vpaddq $T1,$D1,$D1 + vpaddq $T2,$D2,$D2 + + ################################################################ + # lazy reduction + + vpsrlq \$26,$D3,$H3 + vpand $MASK,$D3,$D3 + vpaddq $H3,$D4,$D4 # h3 -> h4 + + vpsrlq \$26,$D0,$H0 + vpand $MASK,$D0,$D0 + vpaddq $H0,$D1,$D1 # h0 -> h1 + + vpsrlq \$26,$D4,$H4 + vpand $MASK,$D4,$D4 + + vpsrlq \$26,$D1,$H1 + vpand $MASK,$D1,$D1 + vpaddq $H1,$D2,$D2 # h1 -> h2 + + vpaddq $H4,$D0,$D0 + vpsllq \$2,$H4,$H4 + vpaddq $H4,$D0,$D0 # h4 -> h0 + + vpsrlq \$26,$D2,$H2 + vpand $MASK,$D2,$D2 + vpaddq $H2,$D3,$D3 # h2 -> h3 + + vpsrlq \$26,$D0,$H0 + vpand $MASK,$D0,$D0 + vpaddq $H0,$D1,$D1 # h0 -> h1 + + vpsrlq \$26,$D3,$H3 + vpand $MASK,$D3,$D3 + vpaddq $H3,$D4,$D4 # h3 -> h4 + + vmovd $D0,`4*0-48-64`($ctx) # save partially reduced + 
vmovd $D1,`4*1-48-64`($ctx) + vmovd $D2,`4*2-48-64`($ctx) + vmovd $D3,`4*3-48-64`($ctx) + vmovd $D4,`4*4-48-64`($ctx) +___ +$code.=<<___ if ($win64); + vmovdqa 0x50(%r11),%xmm6 + vmovdqa 0x60(%r11),%xmm7 + vmovdqa 0x70(%r11),%xmm8 + vmovdqa 0x80(%r11),%xmm9 + vmovdqa 0x90(%r11),%xmm10 + vmovdqa 0xa0(%r11),%xmm11 + vmovdqa 0xb0(%r11),%xmm12 + vmovdqa 0xc0(%r11),%xmm13 + vmovdqa 0xd0(%r11),%xmm14 + vmovdqa 0xe0(%r11),%xmm15 + lea 0xf8(%r11),%rsp +.Ldo_avx_epilogue: +___ +$code.=<<___ if (!$win64); + lea 0x58(%r11),%rsp +.cfi_def_cfa %rsp,8 +___ +$code.=<<___; + vzeroupper + ret +.cfi_endproc +.size poly1305_blocks_avx,.-poly1305_blocks_avx + +.type poly1305_emit_avx,\@function,3 +.align 32 +poly1305_emit_avx: + cmpl \$0,20($ctx) # is_base2_26? + je .Lemit + + mov 0($ctx),%eax # load hash value base 2^26 + mov 4($ctx),%ecx + mov 8($ctx),%r8d + mov 12($ctx),%r11d + mov 16($ctx),%r10d + + shl \$26,%rcx # base 2^26 -> base 2^64 + mov %r8,%r9 + shl \$52,%r8 + add %rcx,%rax + shr \$12,%r9 + add %rax,%r8 # h0 + adc \$0,%r9 + + shl \$14,%r11 + mov %r10,%rax + shr \$24,%r10 + add %r11,%r9 + shl \$40,%rax + add %rax,%r9 # h1 + adc \$0,%r10 # h2 + + mov %r10,%rax # could be partially reduced, so reduce + mov %r10,%rcx + and \$3,%r10 + shr \$2,%rax + and \$-4,%rcx + add %rcx,%rax + add %rax,%r8 + adc \$0,%r9 + adc \$0,%r10 + + mov %r8,%rax + add \$5,%r8 # compare to modulus + mov %r9,%rcx + adc \$0,%r9 + adc \$0,%r10 + shr \$2,%r10 # did 130-bit value overflow? + cmovnz %r8,%rax + cmovnz %r9,%rcx + + add 0($nonce),%rax # accumulate nonce + adc 8($nonce),%rcx + mov %rax,0($mac) # write result + mov %rcx,8($mac) + + ret +.size poly1305_emit_avx,.-poly1305_emit_avx +___ + +if ($avx>1) { +my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) = + map("%ymm$_",(0..15)); +my $S4=$MASK; + +$code.=<<___; +.type poly1305_blocks_avx2,\@function,4 +.align 32 +poly1305_blocks_avx2: +.cfi_startproc + mov 20($ctx),%r8d # is_base2_26 + cmp \$128,$len + jae .Lblocks_avx2 + test %r8d,%r8d + jz .Lblocks + +.Lblocks_avx2: + and \$-16,$len + jz .Lno_data_avx2 + + vzeroupper + + test %r8d,%r8d + jz .Lbase2_64_avx2 + + test \$63,$len + jz .Leven_avx2 + + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lblocks_avx2_body: + + mov $len,%r15 # reassign $len + + mov 0($ctx),$d1 # load hash value + mov 8($ctx),$d2 + mov 16($ctx),$h2#d + + mov 24($ctx),$r0 # load r + mov 32($ctx),$s1 + + ################################# base 2^26 -> base 2^64 + mov $d1#d,$h0#d + and \$`-1*(1<<31)`,$d1 + mov $d2,$r1 # borrow $r1 + mov $d2#d,$h1#d + and \$`-1*(1<<31)`,$d2 + + shr \$6,$d1 + shl \$52,$r1 + add $d1,$h0 + shr \$12,$h1 + shr \$18,$d2 + add $r1,$h0 + adc $d2,$h1 + + mov $h2,$d1 + shl \$40,$d1 + shr \$24,$h2 + add $d1,$h1 + adc \$0,$h2 # can be partially reduced... + + mov \$-4,$d2 # ... 
so reduce + mov $h2,$d1 + and $h2,$d2 + shr \$2,$d1 + and \$3,$h2 + add $d2,$d1 # =*5 + add $d1,$h0 + adc \$0,$h1 + adc \$0,$h2 + + mov $s1,$r1 + mov $s1,%rax + shr \$2,$s1 + add $r1,$s1 # s1 = r1 + (r1 >> 2) + +.Lbase2_26_pre_avx2: + add 0($inp),$h0 # accumulate input + adc 8($inp),$h1 + lea 16($inp),$inp + adc $padbit,$h2 + sub \$16,%r15 + + call __poly1305_block + mov $r1,%rax + + test \$63,%r15 + jnz .Lbase2_26_pre_avx2 + + test $padbit,$padbit # if $padbit is zero, + jz .Lstore_base2_64_avx2 # store hash in base 2^64 format + + ################################# base 2^64 -> base 2^26 + mov $h0,%rax + mov $h0,%rdx + shr \$52,$h0 + mov $h1,$r0 + mov $h1,$r1 + shr \$26,%rdx + and \$0x3ffffff,%rax # h[0] + shl \$12,$r0 + and \$0x3ffffff,%rdx # h[1] + shr \$14,$h1 + or $r0,$h0 + shl \$24,$h2 + and \$0x3ffffff,$h0 # h[2] + shr \$40,$r1 + and \$0x3ffffff,$h1 # h[3] + or $r1,$h2 # h[4] + + test %r15,%r15 + jz .Lstore_base2_26_avx2 + + vmovd %rax#d,%x#$H0 + vmovd %rdx#d,%x#$H1 + vmovd $h0#d,%x#$H2 + vmovd $h1#d,%x#$H3 + vmovd $h2#d,%x#$H4 + jmp .Lproceed_avx2 + +.align 32 +.Lstore_base2_64_avx2: + mov $h0,0($ctx) + mov $h1,8($ctx) + mov $h2,16($ctx) # note that is_base2_26 is zeroed + jmp .Ldone_avx2 + +.align 16 +.Lstore_base2_26_avx2: + mov %rax#d,0($ctx) # store hash value base 2^26 + mov %rdx#d,4($ctx) + mov $h0#d,8($ctx) + mov $h1#d,12($ctx) + mov $h2#d,16($ctx) +.align 16 +.Ldone_avx2: + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbp +.cfi_restore %rbp + mov 40(%rsp),%rbx +.cfi_restore %rbx + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lno_data_avx2: +.Lblocks_avx2_epilogue: + ret +.cfi_endproc + +.align 32 +.Lbase2_64_avx2: +.cfi_startproc + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lbase2_64_avx2_body: + + mov $len,%r15 # reassign $len + + mov 24($ctx),$r0 # load r + mov 32($ctx),$s1 + + mov 0($ctx),$h0 # load hash value + mov 8($ctx),$h1 + mov 16($ctx),$h2#d + + mov $s1,$r1 + mov $s1,%rax + shr \$2,$s1 + add $r1,$s1 # s1 = r1 + (r1 >> 2) + + test \$63,$len + jz .Linit_avx2 + +.Lbase2_64_pre_avx2: + add 0($inp),$h0 # accumulate input + adc 8($inp),$h1 + lea 16($inp),$inp + adc $padbit,$h2 + sub \$16,%r15 + + call __poly1305_block + mov $r1,%rax + + test \$63,%r15 + jnz .Lbase2_64_pre_avx2 + +.Linit_avx2: + ################################# base 2^64 -> base 2^26 + mov $h0,%rax + mov $h0,%rdx + shr \$52,$h0 + mov $h1,$d1 + mov $h1,$d2 + shr \$26,%rdx + and \$0x3ffffff,%rax # h[0] + shl \$12,$d1 + and \$0x3ffffff,%rdx # h[1] + shr \$14,$h1 + or $d1,$h0 + shl \$24,$h2 + and \$0x3ffffff,$h0 # h[2] + shr \$40,$d2 + and \$0x3ffffff,$h1 # h[3] + or $d2,$h2 # h[4] + + vmovd %rax#d,%x#$H0 + vmovd %rdx#d,%x#$H1 + vmovd $h0#d,%x#$H2 + vmovd $h1#d,%x#$H3 + vmovd $h2#d,%x#$H4 + movl \$1,20($ctx) # set is_base2_26 + + call __poly1305_init_avx + +.Lproceed_avx2: + mov %r15,$len # restore $len + mov OPENSSL_ia32cap_P+8(%rip),%r10d + mov \$`(1<<31|1<<30|1<<16)`,%r11d + + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbp +.cfi_restore %rbp + mov 40(%rsp),%rbx +.cfi_restore %rbx + lea 48(%rsp),%rax + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lbase2_64_avx2_epilogue: + jmp .Ldo_avx2 +.cfi_endproc + +.align 32 +.Leven_avx2: 
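+# Entered when the accumulated hash is already in base 2^26: the vector
+# paths keep h as five 26-bit limbs, h = h0 + h1*2^26 + h2*2^52 +
+# h3*2^78 + h4*2^104, so products become 32x32->64-bit lane multiplies
+# (vpmuludq) with headroom to postpone carries. A rough C model (a
+# hypothetical sketch, not part of this file) of the base 2^64 ->
+# base 2^26 split performed by .Linit_avx2 above:
+#
+#	#include <stdint.h>
+#
+#	static void base2_64_to_26(const uint64_t h[3], uint32_t d[5])
+#	{
+#		d[0] = h[0] & 0x3ffffff;
+#		d[1] = (h[0] >> 26) & 0x3ffffff;
+#		d[2] = ((h[0] >> 52) | (h[1] << 12)) & 0x3ffffff;
+#		d[3] = (h[1] >> 14) & 0x3ffffff;
+#		d[4] = (uint32_t)((h[1] >> 40) | (h[2] << 24));
+#		/* no mask on d[4]: h[2] carries only a few bits here */
+#	}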
+.cfi_startproc + mov OPENSSL_ia32cap_P+8(%rip),%r10d + vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26 + vmovd 4*1($ctx),%x#$H1 + vmovd 4*2($ctx),%x#$H2 + vmovd 4*3($ctx),%x#$H3 + vmovd 4*4($ctx),%x#$H4 + +.Ldo_avx2: +___ +$code.=<<___ if ($avx>2); + cmp \$512,$len + jb .Lskip_avx512 + and %r11d,%r10d + test \$`1<<16`,%r10d # check for AVX512F + jnz .Lblocks_avx512 +.Lskip_avx512: +___ +$code.=<<___ if (!$win64); + lea -8(%rsp),%r11 +.cfi_def_cfa %r11,16 + sub \$0x128,%rsp +___ +$code.=<<___ if ($win64); + lea -0xf8(%rsp),%r11 + sub \$0x1c8,%rsp + vmovdqa %xmm6,0x50(%r11) + vmovdqa %xmm7,0x60(%r11) + vmovdqa %xmm8,0x70(%r11) + vmovdqa %xmm9,0x80(%r11) + vmovdqa %xmm10,0x90(%r11) + vmovdqa %xmm11,0xa0(%r11) + vmovdqa %xmm12,0xb0(%r11) + vmovdqa %xmm13,0xc0(%r11) + vmovdqa %xmm14,0xd0(%r11) + vmovdqa %xmm15,0xe0(%r11) +.Ldo_avx2_body: +___ +$code.=<<___; + lea .Lconst(%rip),%rcx + lea 48+64($ctx),$ctx # size optimization + vmovdqa 96(%rcx),$T0 # .Lpermd_avx2 + + # expand and copy pre-calculated table to stack + vmovdqu `16*0-64`($ctx),%x#$T2 + and \$-512,%rsp + vmovdqu `16*1-64`($ctx),%x#$T3 + vmovdqu `16*2-64`($ctx),%x#$T4 + vmovdqu `16*3-64`($ctx),%x#$D0 + vmovdqu `16*4-64`($ctx),%x#$D1 + vmovdqu `16*5-64`($ctx),%x#$D2 + lea 0x90(%rsp),%rax # size optimization + vmovdqu `16*6-64`($ctx),%x#$D3 + vpermd $T2,$T0,$T2 # 00003412 -> 14243444 + vmovdqu `16*7-64`($ctx),%x#$D4 + vpermd $T3,$T0,$T3 + vmovdqu `16*8-64`($ctx),%x#$MASK + vpermd $T4,$T0,$T4 + vmovdqa $T2,0x00(%rsp) + vpermd $D0,$T0,$D0 + vmovdqa $T3,0x20-0x90(%rax) + vpermd $D1,$T0,$D1 + vmovdqa $T4,0x40-0x90(%rax) + vpermd $D2,$T0,$D2 + vmovdqa $D0,0x60-0x90(%rax) + vpermd $D3,$T0,$D3 + vmovdqa $D1,0x80-0x90(%rax) + vpermd $D4,$T0,$D4 + vmovdqa $D2,0xa0-0x90(%rax) + vpermd $MASK,$T0,$MASK + vmovdqa $D3,0xc0-0x90(%rax) + vmovdqa $D4,0xe0-0x90(%rax) + vmovdqa $MASK,0x100-0x90(%rax) + vmovdqa 64(%rcx),$MASK # .Lmask26 + + ################################################################ + # load input + vmovdqu 16*0($inp),%x#$T0 + vmovdqu 16*1($inp),%x#$T1 + vinserti128 \$1,16*2($inp),$T0,$T0 + vinserti128 \$1,16*3($inp),$T1,$T1 + lea 16*4($inp),$inp + + vpsrldq \$6,$T0,$T2 # splat input + vpsrldq \$6,$T1,$T3 + vpunpckhqdq $T1,$T0,$T4 # 4 + vpunpcklqdq $T3,$T2,$T2 # 2:3 + vpunpcklqdq $T1,$T0,$T0 # 0:1 + + vpsrlq \$30,$T2,$T3 + vpsrlq \$4,$T2,$T2 + vpsrlq \$26,$T0,$T1 + vpsrlq \$40,$T4,$T4 # 4 + vpand $MASK,$T2,$T2 # 2 + vpand $MASK,$T0,$T0 # 0 + vpand $MASK,$T1,$T1 # 1 + vpand $MASK,$T3,$T3 # 3 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always + + vpaddq $H2,$T2,$H2 # accumulate input + sub \$64,$len + jz .Ltail_avx2 + jmp .Loop_avx2 + +.align 32 +.Loop_avx2: + ################################################################ + # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4 + # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3 + # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2 + # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1 + # \________/\__________/ + ################################################################ + #vpaddq $H2,$T2,$H2 # accumulate input + vpaddq $H0,$T0,$H0 + vmovdqa `32*0`(%rsp),$T0 # r0^4 + vpaddq $H1,$T1,$H1 + vmovdqa `32*1`(%rsp),$T1 # r1^4 + vpaddq $H3,$T3,$H3 + vmovdqa `32*3`(%rsp),$T2 # r2^4 + vpaddq $H4,$T4,$H4 + vmovdqa `32*6-0x90`(%rax),$T3 # s3^4 + vmovdqa `32*8-0x90`(%rax),$S4 # s4^4 + + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + # + 
# however, as h2 is "chronologically" first one available pull + # corresponding operations up, so it's + # + # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4 + # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4 + + vpmuludq $H2,$T0,$D2 # d2 = h2*r0 + vpmuludq $H2,$T1,$D3 # d3 = h2*r1 + vpmuludq $H2,$T2,$D4 # d4 = h2*r2 + vpmuludq $H2,$T3,$D0 # d0 = h2*s3 + vpmuludq $H2,$S4,$D1 # d1 = h2*s4 + + vpmuludq $H0,$T1,$T4 # h0*r1 + vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp + vpaddq $T4,$D1,$D1 # d1 += h0*r1 + vpaddq $H2,$D2,$D2 # d2 += h1*r1 + vpmuludq $H3,$T1,$T4 # h3*r1 + vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1 + vpaddq $T4,$D4,$D4 # d4 += h3*r1 + vpaddq $H2,$D0,$D0 # d0 += h4*s1 + vmovdqa `32*4-0x90`(%rax),$T1 # s2 + + vpmuludq $H0,$T0,$T4 # h0*r0 + vpmuludq $H1,$T0,$H2 # h1*r0 + vpaddq $T4,$D0,$D0 # d0 += h0*r0 + vpaddq $H2,$D1,$D1 # d1 += h1*r0 + vpmuludq $H3,$T0,$T4 # h3*r0 + vpmuludq $H4,$T0,$H2 # h4*r0 + vmovdqu 16*0($inp),%x#$T0 # load input + vpaddq $T4,$D3,$D3 # d3 += h3*r0 + vpaddq $H2,$D4,$D4 # d4 += h4*r0 + vinserti128 \$1,16*2($inp),$T0,$T0 + + vpmuludq $H3,$T1,$T4 # h3*s2 + vpmuludq $H4,$T1,$H2 # h4*s2 + vmovdqu 16*1($inp),%x#$T1 + vpaddq $T4,$D0,$D0 # d0 += h3*s2 + vpaddq $H2,$D1,$D1 # d1 += h4*s2 + vmovdqa `32*5-0x90`(%rax),$H2 # r3 + vpmuludq $H1,$T2,$T4 # h1*r2 + vpmuludq $H0,$T2,$T2 # h0*r2 + vpaddq $T4,$D3,$D3 # d3 += h1*r2 + vpaddq $T2,$D2,$D2 # d2 += h0*r2 + vinserti128 \$1,16*3($inp),$T1,$T1 + lea 16*4($inp),$inp + + vpmuludq $H1,$H2,$T4 # h1*r3 + vpmuludq $H0,$H2,$H2 # h0*r3 + vpsrldq \$6,$T0,$T2 # splat input + vpaddq $T4,$D4,$D4 # d4 += h1*r3 + vpaddq $H2,$D3,$D3 # d3 += h0*r3 + vpmuludq $H3,$T3,$T4 # h3*s3 + vpmuludq $H4,$T3,$H2 # h4*s3 + vpsrldq \$6,$T1,$T3 + vpaddq $T4,$D1,$D1 # d1 += h3*s3 + vpaddq $H2,$D2,$D2 # d2 += h4*s3 + vpunpckhqdq $T1,$T0,$T4 # 4 + + vpmuludq $H3,$S4,$H3 # h3*s4 + vpmuludq $H4,$S4,$H4 # h4*s4 + vpunpcklqdq $T1,$T0,$T0 # 0:1 + vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 + vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 + vpunpcklqdq $T3,$T2,$T3 # 2:3 + vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4 + vpmuludq $H1,$S4,$H0 # h1*s4 + vmovdqa 64(%rcx),$MASK # .Lmask26 + vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 + vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 + + ################################################################ + # lazy reduction (interleaved with tail of input splat) + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$D1,$H1 # h0 -> h1 + + vpsrlq \$26,$H4,$D4 + vpand $MASK,$H4,$H4 + + vpsrlq \$4,$T3,$T2 + + vpsrlq \$26,$H1,$D1 + vpand $MASK,$H1,$H1 + vpaddq $D1,$H2,$H2 # h1 -> h2 + + vpaddq $D4,$H0,$H0 + vpsllq \$2,$D4,$D4 + vpaddq $D4,$H0,$H0 # h4 -> h0 + + vpand $MASK,$T2,$T2 # 2 + vpsrlq \$26,$T0,$T1 + + vpsrlq \$26,$H2,$D2 + vpand $MASK,$H2,$H2 + vpaddq $D2,$H3,$H3 # h2 -> h3 + + vpaddq $T2,$H2,$H2 # modulo-scheduled + vpsrlq \$30,$T3,$T3 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$40,$T4,$T4 # 4 + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpand $MASK,$T0,$T0 # 0 + vpand $MASK,$T1,$T1 # 1 + vpand $MASK,$T3,$T3 # 3 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always + + sub \$64,$len + jnz .Loop_avx2 + + .byte 0x66,0x90 +.Ltail_avx2: + ################################################################ + # while above multiplications were by r^4 in 
all lanes, in last + # iteration we multiply least significant lane by r^4 and most + # significant one by r, so copy of above except that references + # to the precomputed table are displaced by 4... + + #vpaddq $H2,$T2,$H2 # accumulate input + vpaddq $H0,$T0,$H0 + vmovdqu `32*0+4`(%rsp),$T0 # r0^4 + vpaddq $H1,$T1,$H1 + vmovdqu `32*1+4`(%rsp),$T1 # r1^4 + vpaddq $H3,$T3,$H3 + vmovdqu `32*3+4`(%rsp),$T2 # r2^4 + vpaddq $H4,$T4,$H4 + vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4 + vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4 + + vpmuludq $H2,$T0,$D2 # d2 = h2*r0 + vpmuludq $H2,$T1,$D3 # d3 = h2*r1 + vpmuludq $H2,$T2,$D4 # d4 = h2*r2 + vpmuludq $H2,$T3,$D0 # d0 = h2*s3 + vpmuludq $H2,$S4,$D1 # d1 = h2*s4 + + vpmuludq $H0,$T1,$T4 # h0*r1 + vpmuludq $H1,$T1,$H2 # h1*r1 + vpaddq $T4,$D1,$D1 # d1 += h0*r1 + vpaddq $H2,$D2,$D2 # d2 += h1*r1 + vpmuludq $H3,$T1,$T4 # h3*r1 + vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1 + vpaddq $T4,$D4,$D4 # d4 += h3*r1 + vpaddq $H2,$D0,$D0 # d0 += h4*s1 + + vpmuludq $H0,$T0,$T4 # h0*r0 + vpmuludq $H1,$T0,$H2 # h1*r0 + vpaddq $T4,$D0,$D0 # d0 += h0*r0 + vmovdqu `32*4+4-0x90`(%rax),$T1 # s2 + vpaddq $H2,$D1,$D1 # d1 += h1*r0 + vpmuludq $H3,$T0,$T4 # h3*r0 + vpmuludq $H4,$T0,$H2 # h4*r0 + vpaddq $T4,$D3,$D3 # d3 += h3*r0 + vpaddq $H2,$D4,$D4 # d4 += h4*r0 + + vpmuludq $H3,$T1,$T4 # h3*s2 + vpmuludq $H4,$T1,$H2 # h4*s2 + vpaddq $T4,$D0,$D0 # d0 += h3*s2 + vpaddq $H2,$D1,$D1 # d1 += h4*s2 + vmovdqu `32*5+4-0x90`(%rax),$H2 # r3 + vpmuludq $H1,$T2,$T4 # h1*r2 + vpmuludq $H0,$T2,$T2 # h0*r2 + vpaddq $T4,$D3,$D3 # d3 += h1*r2 + vpaddq $T2,$D2,$D2 # d2 += h0*r2 + + vpmuludq $H1,$H2,$T4 # h1*r3 + vpmuludq $H0,$H2,$H2 # h0*r3 + vpaddq $T4,$D4,$D4 # d4 += h1*r3 + vpaddq $H2,$D3,$D3 # d3 += h0*r3 + vpmuludq $H3,$T3,$T4 # h3*s3 + vpmuludq $H4,$T3,$H2 # h4*s3 + vpaddq $T4,$D1,$D1 # d1 += h3*s3 + vpaddq $H2,$D2,$D2 # d2 += h4*s3 + + vpmuludq $H3,$S4,$H3 # h3*s4 + vpmuludq $H4,$S4,$H4 # h4*s4 + vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 + vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 + vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4 + vpmuludq $H1,$S4,$H0 # h1*s4 + vmovdqa 64(%rcx),$MASK # .Lmask26 + vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 + vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 + + ################################################################ + # horizontal addition + + vpsrldq \$8,$D1,$T1 + vpsrldq \$8,$H2,$T2 + vpsrldq \$8,$H3,$T3 + vpsrldq \$8,$H4,$T4 + vpsrldq \$8,$H0,$T0 + vpaddq $T1,$D1,$D1 + vpaddq $T2,$H2,$H2 + vpaddq $T3,$H3,$H3 + vpaddq $T4,$H4,$H4 + vpaddq $T0,$H0,$H0 + + vpermq \$0x2,$H3,$T3 + vpermq \$0x2,$H4,$T4 + vpermq \$0x2,$H0,$T0 + vpermq \$0x2,$D1,$T1 + vpermq \$0x2,$H2,$T2 + vpaddq $T3,$H3,$H3 + vpaddq $T4,$H4,$H4 + vpaddq $T0,$H0,$H0 + vpaddq $T1,$D1,$D1 + vpaddq $T2,$H2,$H2 + + ################################################################ + # lazy reduction + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$D1,$H1 # h0 -> h1 + + vpsrlq \$26,$H4,$D4 + vpand $MASK,$H4,$H4 + + vpsrlq \$26,$H1,$D1 + vpand $MASK,$H1,$H1 + vpaddq $D1,$H2,$H2 # h1 -> h2 + + vpaddq $D4,$H0,$H0 + vpsllq \$2,$D4,$D4 + vpaddq $D4,$H0,$H0 # h4 -> h0 + + vpsrlq \$26,$H2,$D2 + vpand $MASK,$H2,$H2 + vpaddq $D2,$H3,$H3 # h2 -> h3 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced + vmovd %x#$H1,`4*1-48-64`($ctx) + vmovd %x#$H2,`4*2-48-64`($ctx) + vmovd %x#$H3,`4*3-48-64`($ctx) 
+ vmovd %x#$H4,`4*4-48-64`($ctx) +___ +$code.=<<___ if ($win64); + vmovdqa 0x50(%r11),%xmm6 + vmovdqa 0x60(%r11),%xmm7 + vmovdqa 0x70(%r11),%xmm8 + vmovdqa 0x80(%r11),%xmm9 + vmovdqa 0x90(%r11),%xmm10 + vmovdqa 0xa0(%r11),%xmm11 + vmovdqa 0xb0(%r11),%xmm12 + vmovdqa 0xc0(%r11),%xmm13 + vmovdqa 0xd0(%r11),%xmm14 + vmovdqa 0xe0(%r11),%xmm15 + lea 0xf8(%r11),%rsp +.Ldo_avx2_epilogue: +___ +$code.=<<___ if (!$win64); + lea 8(%r11),%rsp +.cfi_def_cfa %rsp,8 +___ +$code.=<<___; + vzeroupper + ret +.cfi_endproc +.size poly1305_blocks_avx2,.-poly1305_blocks_avx2 +___ +####################################################################### +if ($avx>2) { +# On entry we have input length divisible by 64. But since inner loop +# processes 128 bytes per iteration, cases when length is not divisible +# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this +# reason stack layout is kept identical to poly1305_blocks_avx2. If not +# for this tail, we wouldn't have to even allocate stack frame... + +my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24)); +my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29)); +my $PADBIT="%zmm30"; + +map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain +map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4)); +map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4)); +map(s/%y/%z/,($MASK)); + +$code.=<<___; +.type poly1305_blocks_avx512,\@function,4 +.align 32 +poly1305_blocks_avx512: +.cfi_startproc +.Lblocks_avx512: + mov \$15,%eax + kmovw %eax,%k2 +___ +$code.=<<___ if (!$win64); + lea -8(%rsp),%r11 +.cfi_def_cfa %r11,16 + sub \$0x128,%rsp +___ +$code.=<<___ if ($win64); + lea -0xf8(%rsp),%r11 + sub \$0x1c8,%rsp + vmovdqa %xmm6,0x50(%r11) + vmovdqa %xmm7,0x60(%r11) + vmovdqa %xmm8,0x70(%r11) + vmovdqa %xmm9,0x80(%r11) + vmovdqa %xmm10,0x90(%r11) + vmovdqa %xmm11,0xa0(%r11) + vmovdqa %xmm12,0xb0(%r11) + vmovdqa %xmm13,0xc0(%r11) + vmovdqa %xmm14,0xd0(%r11) + vmovdqa %xmm15,0xe0(%r11) +.Ldo_avx512_body: +___ +$code.=<<___; + lea .Lconst(%rip),%rcx + lea 48+64($ctx),$ctx # size optimization + vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2 + + # expand pre-calculated table + vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0} + and \$-512,%rsp + vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1} + mov \$0x20,%rax + vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1} + vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2} + vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2} + vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3} + vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3} + vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4} + vmovdqu `16*8-64`($ctx),%x#$T4 # ... 
${S4} + vpermd $D0,$T2,$R0 # 00003412 -> 14243444 + vpbroadcastq 64(%rcx),$MASK # .Lmask26 + vpermd $D1,$T2,$R1 + vpermd $T0,$T2,$S1 + vpermd $D2,$T2,$R2 + vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0 + vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304 + vpermd $T1,$T2,$S2 + vmovdqu64 $R1,0x00(%rsp,%rax){%k2} + vpsrlq \$32,$R1,$T1 + vpermd $D3,$T2,$R3 + vmovdqa64 $S1,0x40(%rsp){%k2} + vpermd $T3,$T2,$S3 + vpermd $D4,$T2,$R4 + vmovdqu64 $R2,0x40(%rsp,%rax){%k2} + vpermd $T4,$T2,$S4 + vmovdqa64 $S2,0x80(%rsp){%k2} + vmovdqu64 $R3,0x80(%rsp,%rax){%k2} + vmovdqa64 $S3,0xc0(%rsp){%k2} + vmovdqu64 $R4,0xc0(%rsp,%rax){%k2} + vmovdqa64 $S4,0x100(%rsp){%k2} + + ################################################################ + # calculate 5th through 8th powers of the key + # + # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1 + # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2 + # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3 + # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4 + # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0 + + vpmuludq $T0,$R0,$D0 # d0 = r0'*r0 + vpmuludq $T0,$R1,$D1 # d1 = r0'*r1 + vpmuludq $T0,$R2,$D2 # d2 = r0'*r2 + vpmuludq $T0,$R3,$D3 # d3 = r0'*r3 + vpmuludq $T0,$R4,$D4 # d4 = r0'*r4 + vpsrlq \$32,$R2,$T2 + + vpmuludq $T1,$S4,$M0 + vpmuludq $T1,$R0,$M1 + vpmuludq $T1,$R1,$M2 + vpmuludq $T1,$R2,$M3 + vpmuludq $T1,$R3,$M4 + vpsrlq \$32,$R3,$T3 + vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4 + vpaddq $M1,$D1,$D1 # d1 += r1'*r0 + vpaddq $M2,$D2,$D2 # d2 += r1'*r1 + vpaddq $M3,$D3,$D3 # d3 += r1'*r2 + vpaddq $M4,$D4,$D4 # d4 += r1'*r3 + + vpmuludq $T2,$S3,$M0 + vpmuludq $T2,$S4,$M1 + vpmuludq $T2,$R1,$M3 + vpmuludq $T2,$R2,$M4 + vpmuludq $T2,$R0,$M2 + vpsrlq \$32,$R4,$T4 + vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3 + vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4 + vpaddq $M3,$D3,$D3 # d3 += r2'*r1 + vpaddq $M4,$D4,$D4 # d4 += r2'*r2 + vpaddq $M2,$D2,$D2 # d2 += r2'*r0 + + vpmuludq $T3,$S2,$M0 + vpmuludq $T3,$R0,$M3 + vpmuludq $T3,$R1,$M4 + vpmuludq $T3,$S3,$M1 + vpmuludq $T3,$S4,$M2 + vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2 + vpaddq $M3,$D3,$D3 # d3 += r3'*r0 + vpaddq $M4,$D4,$D4 # d4 += r3'*r1 + vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3 + vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4 + + vpmuludq $T4,$S4,$M3 + vpmuludq $T4,$R0,$M4 + vpmuludq $T4,$S1,$M0 + vpmuludq $T4,$S2,$M1 + vpmuludq $T4,$S3,$M2 + vpaddq $M3,$D3,$D3 # d3 += r2'*5*r4 + vpaddq $M4,$D4,$D4 # d4 += r2'*r0 + vpaddq $M0,$D0,$D0 # d0 += r2'*5*r1 + vpaddq $M1,$D1,$D1 # d1 += r2'*5*r2 + vpaddq $M2,$D2,$D2 # d2 += r2'*5*r3 + + ################################################################ + # load input + vmovdqu64 16*0($inp),%z#$T3 + vmovdqu64 16*4($inp),%z#$T4 + lea 16*8($inp),$inp + + ################################################################ + # lazy reduction + + vpsrlq \$26,$D3,$M3 + vpandq $MASK,$D3,$D3 + vpaddq $M3,$D4,$D4 # d3 -> d4 + + vpsrlq \$26,$D0,$M0 + vpandq $MASK,$D0,$D0 + vpaddq $M0,$D1,$D1 # d0 -> d1 + + vpsrlq \$26,$D4,$M4 + vpandq $MASK,$D4,$D4 + + vpsrlq \$26,$D1,$M1 + vpandq $MASK,$D1,$D1 + vpaddq $M1,$D2,$D2 # d1 -> d2 + + vpaddq $M4,$D0,$D0 + vpsllq \$2,$M4,$M4 + vpaddq $M4,$D0,$D0 # d4 -> d0 + + vpsrlq \$26,$D2,$M2 + vpandq $MASK,$D2,$D2 + vpaddq $M2,$D3,$D3 # d2 -> d3 + + vpsrlq \$26,$D0,$M0 + vpandq $MASK,$D0,$D0 + vpaddq $M0,$D1,$D1 # d0 -> d1 + + vpsrlq \$26,$D3,$M3 + vpandq $MASK,$D3,$D3 + vpaddq $M3,$D4,$D4 # d3 -> d4 + + ################################################################ + # at this point we have 14243444 in $R0-$S4 and 05060708 in + # $D0-$D4, ... 
+ + vpunpcklqdq $T4,$T3,$T0 # transpose input + vpunpckhqdq $T4,$T3,$T4 + + # ... since input 64-bit lanes are ordered as 73625140, we could + # "vperm" it to 76543210 (here and in each loop iteration), *or* + # we could just flow along, hence the goal for $R0-$S4 is + # 1858286838784888 ... + + vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512: + mov \$0x7777,%eax + kmovw %eax,%k1 + + vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4--- + vpermd $R1,$M0,$R1 + vpermd $R2,$M0,$R2 + vpermd $R3,$M0,$R3 + vpermd $R4,$M0,$R4 + + vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888 + vpermd $D1,$M0,${R1}{%k1} + vpermd $D2,$M0,${R2}{%k1} + vpermd $D3,$M0,${R3}{%k1} + vpermd $D4,$M0,${R4}{%k1} + + vpslld \$2,$R1,$S1 # *5 + vpslld \$2,$R2,$S2 + vpslld \$2,$R3,$S3 + vpslld \$2,$R4,$S4 + vpaddd $R1,$S1,$S1 + vpaddd $R2,$S2,$S2 + vpaddd $R3,$S3,$S3 + vpaddd $R4,$S4,$S4 + + vpbroadcastq 32(%rcx),$PADBIT # .L129 + + vpsrlq \$52,$T0,$T2 # splat input + vpsllq \$12,$T4,$T3 + vporq $T3,$T2,$T2 + vpsrlq \$26,$T0,$T1 + vpsrlq \$14,$T4,$T3 + vpsrlq \$40,$T4,$T4 # 4 + vpandq $MASK,$T2,$T2 # 2 + vpandq $MASK,$T0,$T0 # 0 + #vpandq $MASK,$T1,$T1 # 1 + #vpandq $MASK,$T3,$T3 # 3 + #vporq $PADBIT,$T4,$T4 # padbit, yes, always + + vpaddq $H2,$T2,$H2 # accumulate input + sub \$192,$len + jbe .Ltail_avx512 + jmp .Loop_avx512 + +.align 32 +.Loop_avx512: + ################################################################ + # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8 + # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7 + # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6 + # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5 + # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4 + # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3 + # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2 + # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1 + # \________/\___________/ + ################################################################ + #vpaddq $H2,$T2,$H2 # accumulate input + + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + # + # however, as h2 is "chronologically" first one available pull + # corresponding operations up, so it's + # + # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4 + # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0 + # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1 + # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2 + # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3 + + vpmuludq $H2,$R1,$D3 # d3 = h2*r1 + vpaddq $H0,$T0,$H0 + vpmuludq $H2,$R2,$D4 # d4 = h2*r2 + vpandq $MASK,$T1,$T1 # 1 + vpmuludq $H2,$S3,$D0 # d0 = h2*s3 + vpandq $MASK,$T3,$T3 # 3 + vpmuludq $H2,$S4,$D1 # d1 = h2*s4 + vporq $PADBIT,$T4,$T4 # padbit, yes, always + vpmuludq $H2,$R0,$D2 # d2 = h2*r0 + vpaddq $H1,$T1,$H1 # accumulate input + vpaddq $H3,$T3,$H3 + vpaddq $H4,$T4,$H4 + + vmovdqu64 16*0($inp),$T3 # load input + vmovdqu64 16*4($inp),$T4 + lea 16*8($inp),$inp + vpmuludq $H0,$R3,$M3 + vpmuludq $H0,$R4,$M4 + vpmuludq $H0,$R0,$M0 + vpmuludq $H0,$R1,$M1 + vpaddq $M3,$D3,$D3 # d3 += h0*r3 + vpaddq $M4,$D4,$D4 # d4 += h0*r4 + vpaddq $M0,$D0,$D0 # d0 += h0*r0 + vpaddq $M1,$D1,$D1 # d1 += h0*r1 + + vpmuludq $H1,$R2,$M3 + vpmuludq $H1,$R3,$M4 + vpmuludq $H1,$S4,$M0 + vpmuludq $H0,$R2,$M2 + vpaddq $M3,$D3,$D3 # d3 += h1*r2 + vpaddq $M4,$D4,$D4 # d4 += h1*r3 + vpaddq $M0,$D0,$D0 # d0 += h1*s4 + vpaddq $M2,$D2,$D2 # d2 += h0*r2 + + vpunpcklqdq $T4,$T3,$T0 # transpose input + vpunpckhqdq $T4,$T3,$T4 + + vpmuludq $H3,$R0,$M3 + 
vpmuludq $H3,$R1,$M4 + vpmuludq $H1,$R0,$M1 + vpmuludq $H1,$R1,$M2 + vpaddq $M3,$D3,$D3 # d3 += h3*r0 + vpaddq $M4,$D4,$D4 # d4 += h3*r1 + vpaddq $M1,$D1,$D1 # d1 += h1*r0 + vpaddq $M2,$D2,$D2 # d2 += h1*r1 + + vpmuludq $H4,$S4,$M3 + vpmuludq $H4,$R0,$M4 + vpmuludq $H3,$S2,$M0 + vpmuludq $H3,$S3,$M1 + vpaddq $M3,$D3,$D3 # d3 += h4*s4 + vpmuludq $H3,$S4,$M2 + vpaddq $M4,$D4,$D4 # d4 += h4*r0 + vpaddq $M0,$D0,$D0 # d0 += h3*s2 + vpaddq $M1,$D1,$D1 # d1 += h3*s3 + vpaddq $M2,$D2,$D2 # d2 += h3*s4 + + vpmuludq $H4,$S1,$M0 + vpmuludq $H4,$S2,$M1 + vpmuludq $H4,$S3,$M2 + vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 + vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 + vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 + + ################################################################ + # lazy reduction (interleaved with input splat) + + vpsrlq \$52,$T0,$T2 # splat input + vpsllq \$12,$T4,$T3 + + vpsrlq \$26,$D3,$H3 + vpandq $MASK,$D3,$D3 + vpaddq $H3,$D4,$H4 # h3 -> h4 + + vporq $T3,$T2,$T2 + + vpsrlq \$26,$H0,$D0 + vpandq $MASK,$H0,$H0 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpandq $MASK,$T2,$T2 # 2 + + vpsrlq \$26,$H4,$D4 + vpandq $MASK,$H4,$H4 + + vpsrlq \$26,$H1,$D1 + vpandq $MASK,$H1,$H1 + vpaddq $D1,$H2,$H2 # h1 -> h2 + + vpaddq $D4,$H0,$H0 + vpsllq \$2,$D4,$D4 + vpaddq $D4,$H0,$H0 # h4 -> h0 + + vpaddq $T2,$H2,$H2 # modulo-scheduled + vpsrlq \$26,$T0,$T1 + + vpsrlq \$26,$H2,$D2 + vpandq $MASK,$H2,$H2 + vpaddq $D2,$D3,$H3 # h2 -> h3 + + vpsrlq \$14,$T4,$T3 + + vpsrlq \$26,$H0,$D0 + vpandq $MASK,$H0,$H0 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$40,$T4,$T4 # 4 + + vpsrlq \$26,$H3,$D3 + vpandq $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpandq $MASK,$T0,$T0 # 0 + #vpandq $MASK,$T1,$T1 # 1 + #vpandq $MASK,$T3,$T3 # 3 + #vporq $PADBIT,$T4,$T4 # padbit, yes, always + + sub \$128,$len + ja .Loop_avx512 + +.Ltail_avx512: + ################################################################ + # while above multiplications were by r^8 in all lanes, in last + # iteration we multiply least significant lane by r^8 and most + # significant one by r, that's why table gets shifted... 
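+	#
+	# [editor's illustration, not in the original] written out, the
+	# tail evaluates
+	#
+	#	h[0]*r^8 + h[1]*r^7 + ... + h[6]*r^2 + h[7]*r^1
+	#
+	# so lane i needs its own power r^(8-i); the vpsrlq \$32 shifts
+	# below slide the interleaved power table by one 32-bit slot to
+	# realize that per-lane assignment.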
+ + vpsrlq \$32,$R0,$R0 # 0105020603070408 + vpsrlq \$32,$R1,$R1 + vpsrlq \$32,$R2,$R2 + vpsrlq \$32,$S3,$S3 + vpsrlq \$32,$S4,$S4 + vpsrlq \$32,$R3,$R3 + vpsrlq \$32,$R4,$R4 + vpsrlq \$32,$S1,$S1 + vpsrlq \$32,$S2,$S2 + + ################################################################ + # load either next or last 64 byte of input + lea ($inp,$len),$inp + + #vpaddq $H2,$T2,$H2 # accumulate input + vpaddq $H0,$T0,$H0 + + vpmuludq $H2,$R1,$D3 # d3 = h2*r1 + vpmuludq $H2,$R2,$D4 # d4 = h2*r2 + vpmuludq $H2,$S3,$D0 # d0 = h2*s3 + vpandq $MASK,$T1,$T1 # 1 + vpmuludq $H2,$S4,$D1 # d1 = h2*s4 + vpandq $MASK,$T3,$T3 # 3 + vpmuludq $H2,$R0,$D2 # d2 = h2*r0 + vporq $PADBIT,$T4,$T4 # padbit, yes, always + vpaddq $H1,$T1,$H1 # accumulate input + vpaddq $H3,$T3,$H3 + vpaddq $H4,$T4,$H4 + + vmovdqu 16*0($inp),%x#$T0 + vpmuludq $H0,$R3,$M3 + vpmuludq $H0,$R4,$M4 + vpmuludq $H0,$R0,$M0 + vpmuludq $H0,$R1,$M1 + vpaddq $M3,$D3,$D3 # d3 += h0*r3 + vpaddq $M4,$D4,$D4 # d4 += h0*r4 + vpaddq $M0,$D0,$D0 # d0 += h0*r0 + vpaddq $M1,$D1,$D1 # d1 += h0*r1 + + vmovdqu 16*1($inp),%x#$T1 + vpmuludq $H1,$R2,$M3 + vpmuludq $H1,$R3,$M4 + vpmuludq $H1,$S4,$M0 + vpmuludq $H0,$R2,$M2 + vpaddq $M3,$D3,$D3 # d3 += h1*r2 + vpaddq $M4,$D4,$D4 # d4 += h1*r3 + vpaddq $M0,$D0,$D0 # d0 += h1*s4 + vpaddq $M2,$D2,$D2 # d2 += h0*r2 + + vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0 + vpmuludq $H3,$R0,$M3 + vpmuludq $H3,$R1,$M4 + vpmuludq $H1,$R0,$M1 + vpmuludq $H1,$R1,$M2 + vpaddq $M3,$D3,$D3 # d3 += h3*r0 + vpaddq $M4,$D4,$D4 # d4 += h3*r1 + vpaddq $M1,$D1,$D1 # d1 += h1*r0 + vpaddq $M2,$D2,$D2 # d2 += h1*r1 + + vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1 + vpmuludq $H4,$S4,$M3 + vpmuludq $H4,$R0,$M4 + vpmuludq $H3,$S2,$M0 + vpmuludq $H3,$S3,$M1 + vpmuludq $H3,$S4,$M2 + vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4 + vpaddq $M4,$D4,$D4 # d4 += h4*r0 + vpaddq $M0,$D0,$D0 # d0 += h3*s2 + vpaddq $M1,$D1,$D1 # d1 += h3*s3 + vpaddq $M2,$D2,$D2 # d2 += h3*s4 + + vpmuludq $H4,$S1,$M0 + vpmuludq $H4,$S2,$M1 + vpmuludq $H4,$S3,$M2 + vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 + vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 + vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 + + ################################################################ + # horizontal addition + + mov \$1,%eax + vpermq \$0xb1,$H3,$D3 + vpermq \$0xb1,$D4,$H4 + vpermq \$0xb1,$H0,$D0 + vpermq \$0xb1,$H1,$D1 + vpermq \$0xb1,$H2,$D2 + vpaddq $D3,$H3,$H3 + vpaddq $D4,$H4,$H4 + vpaddq $D0,$H0,$H0 + vpaddq $D1,$H1,$H1 + vpaddq $D2,$H2,$H2 + + kmovw %eax,%k3 + vpermq \$0x2,$H3,$D3 + vpermq \$0x2,$H4,$D4 + vpermq \$0x2,$H0,$D0 + vpermq \$0x2,$H1,$D1 + vpermq \$0x2,$H2,$D2 + vpaddq $D3,$H3,$H3 + vpaddq $D4,$H4,$H4 + vpaddq $D0,$H0,$H0 + vpaddq $D1,$H1,$H1 + vpaddq $D2,$H2,$H2 + + vextracti64x4 \$0x1,$H3,%y#$D3 + vextracti64x4 \$0x1,$H4,%y#$D4 + vextracti64x4 \$0x1,$H0,%y#$D0 + vextracti64x4 \$0x1,$H1,%y#$D1 + vextracti64x4 \$0x1,$H2,%y#$D2 + vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case + vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2 + vpaddq $D0,$H0,${H0}{%k3}{z} + vpaddq $D1,$H1,${H1}{%k3}{z} + vpaddq $D2,$H2,${H2}{%k3}{z} +___ +map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT)); +map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK)); +$code.=<<___; + ################################################################ + # lazy reduction (interleaved with input splat) + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpsrldq \$6,$T0,$T2 # splat input + vpsrldq \$6,$T1,$T3 + vpunpckhqdq $T1,$T0,$T4 # 4 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpunpcklqdq $T3,$T2,$T2 
# 2:3 + vpunpcklqdq $T1,$T0,$T0 # 0:1 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$26,$H4,$D4 + vpand $MASK,$H4,$H4 + + vpsrlq \$26,$H1,$D1 + vpand $MASK,$H1,$H1 + vpsrlq \$30,$T2,$T3 + vpsrlq \$4,$T2,$T2 + vpaddq $D1,$H2,$H2 # h1 -> h2 + + vpaddq $D4,$H0,$H0 + vpsllq \$2,$D4,$D4 + vpsrlq \$26,$T0,$T1 + vpsrlq \$40,$T4,$T4 # 4 + vpaddq $D4,$H0,$H0 # h4 -> h0 + + vpsrlq \$26,$H2,$D2 + vpand $MASK,$H2,$H2 + vpand $MASK,$T2,$T2 # 2 + vpand $MASK,$T0,$T0 # 0 + vpaddq $D2,$H3,$H3 # h2 -> h3 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2 + vpand $MASK,$T1,$T1 # 1 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpand $MASK,$T3,$T3 # 3 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always + vpaddq $D3,$H4,$H4 # h3 -> h4 + + lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2 + add \$64,$len + jnz .Ltail_avx2 + + vpsubq $T2,$H2,$H2 # undo input accumulation + vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced + vmovd %x#$H1,`4*1-48-64`($ctx) + vmovd %x#$H2,`4*2-48-64`($ctx) + vmovd %x#$H3,`4*3-48-64`($ctx) + vmovd %x#$H4,`4*4-48-64`($ctx) + vzeroall +___ +$code.=<<___ if ($win64); + movdqa 0x50(%r11),%xmm6 + movdqa 0x60(%r11),%xmm7 + movdqa 0x70(%r11),%xmm8 + movdqa 0x80(%r11),%xmm9 + movdqa 0x90(%r11),%xmm10 + movdqa 0xa0(%r11),%xmm11 + movdqa 0xb0(%r11),%xmm12 + movdqa 0xc0(%r11),%xmm13 + movdqa 0xd0(%r11),%xmm14 + movdqa 0xe0(%r11),%xmm15 + lea 0xf8(%r11),%rsp +.Ldo_avx512_epilogue: +___ +$code.=<<___ if (!$win64); + lea 8(%r11),%rsp +.cfi_def_cfa %rsp,8 +___ +$code.=<<___; + ret +.cfi_endproc +.size poly1305_blocks_avx512,.-poly1305_blocks_avx512 +___ +if ($avx>3) { +######################################################################## +# VPMADD52 version using 2^44 radix. +# +# One can argue that base 2^52 would be more natural. Well, even though +# some operations would be more natural, one has to recognize couple of +# things. Base 2^52 doesn't provide advantage over base 2^44 if you look +# at amount of multiply-n-accumulate operations. Secondly, it makes it +# impossible to pre-compute multiples of 5 [referred to as s[]/sN in +# reference implementations], which means that more such operations +# would have to be performed in inner loop, which in turn makes critical +# path longer. In other words, even though base 2^44 reduction might +# look less elegant, overall critical path is actually shorter... + +######################################################################## +# Layout of opaque area is following. 
+# +# unsigned __int64 h[3]; # current hash value base 2^44 +# unsigned __int64 s[2]; # key value*20 base 2^44 +# unsigned __int64 r[3]; # key value base 2^44 +# struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4]; +# # r^n positions reflect +# # placement in register, not +# # memory, R[3] is R[1]*20 + +$code.=<<___; +.type poly1305_init_base2_44,\@function,3 +.align 32 +poly1305_init_base2_44: + xor %rax,%rax + mov %rax,0($ctx) # initialize hash value + mov %rax,8($ctx) + mov %rax,16($ctx) + +.Linit_base2_44: + lea poly1305_blocks_vpmadd52(%rip),%r10 + lea poly1305_emit_base2_44(%rip),%r11 + + mov \$0x0ffffffc0fffffff,%rax + mov \$0x0ffffffc0ffffffc,%rcx + and 0($inp),%rax + mov \$0x00000fffffffffff,%r8 + and 8($inp),%rcx + mov \$0x00000fffffffffff,%r9 + and %rax,%r8 + shrd \$44,%rcx,%rax + mov %r8,40($ctx) # r0 + and %r9,%rax + shr \$24,%rcx + mov %rax,48($ctx) # r1 + lea (%rax,%rax,4),%rax # *5 + mov %rcx,56($ctx) # r2 + shl \$2,%rax # magic <<2 + lea (%rcx,%rcx,4),%rcx # *5 + shl \$2,%rcx # magic <<2 + mov %rax,24($ctx) # s1 + mov %rcx,32($ctx) # s2 + movq \$-1,64($ctx) # write impossible value +___ +$code.=<<___ if ($flavour !~ /elf32/); + mov %r10,0(%rdx) + mov %r11,8(%rdx) +___ +$code.=<<___ if ($flavour =~ /elf32/); + mov %r10d,0(%rdx) + mov %r11d,4(%rdx) +___ +$code.=<<___; + mov \$1,%eax + ret +.size poly1305_init_base2_44,.-poly1305_init_base2_44 +___ +{ +my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17)); +my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21)); +my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25)); + +$code.=<<___; +.type poly1305_blocks_vpmadd52,\@function,4 +.align 32 +poly1305_blocks_vpmadd52: + shr \$4,$len + jz .Lno_data_vpmadd52 # too short + + shl \$40,$padbit + mov 64($ctx),%r8 # peek on power of the key + + # if powers of the key are not calculated yet, process up to 3 + # blocks with this single-block subroutine, otherwise ensure that + # length is divisible by 2 blocks and pass the rest down to next + # subroutine... + + mov \$3,%rax + mov \$1,%r10 + cmp \$4,$len # is input long + cmovae %r10,%rax + test %r8,%r8 # is power value impossible? + cmovns %r10,%rax + + and $len,%rax # is input of favourable length? 
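+	# [editor's sketch, not part of the original code] the branchless
+	# selection above, in C ('powers_done' is the sentinel peeked from
+	# 64($ctx); init stores -1 there until the key powers exist):
+	#
+	#	n = (blocks >= 4 || powers_done >= 0) ? 1 : 3;
+	#	n &= blocks;	/* blocks to retire in the 1x loop */
+	#	if (n == 0)	/* length already favourable      */
+	#		goto blocks_vpmadd52_4x;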
+ jz .Lblocks_vpmadd52_4x + + sub %rax,$len + mov \$7,%r10d + mov \$1,%r11d + kmovw %r10d,%k7 + lea .L2_44_inp_permd(%rip),%r10 + kmovw %r11d,%k1 + + vmovq $padbit,%x#$PAD + vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd + vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift + vpermq \$0xcf,$PAD,$PAD + vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask + + vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value + vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys + vmovdqu64 32($ctx),${r1r0s2}{%k7}{z} + vmovdqu64 24($ctx),${r0s2s1}{%k7}{z} + + vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt + vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft + + jmp .Loop_vpmadd52 + +.align 32 +.Loop_vpmadd52: + vmovdqu32 0($inp),%x#$T0 # load input as ----3210 + lea 16($inp),$inp + + vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110 + vpsrlvq $inp_shift,$T0,$T0 + vpandq $reduc_mask,$T0,$T0 + vporq $PAD,$T0,$T0 + + vpaddq $T0,$Dlo,$Dlo # accumulate input + + vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value + vpermq \$0b01010101,$Dlo,${H1}{%k7}{z} + vpermq \$0b10101010,$Dlo,${H2}{%k7}{z} + + vpxord $Dlo,$Dlo,$Dlo + vpxord $Dhi,$Dhi,$Dhi + + vpmadd52luq $r2r1r0,$H0,$Dlo + vpmadd52huq $r2r1r0,$H0,$Dhi + + vpmadd52luq $r1r0s2,$H1,$Dlo + vpmadd52huq $r1r0s2,$H1,$Dhi + + vpmadd52luq $r0s2s1,$H2,$Dlo + vpmadd52huq $r0s2s1,$H2,$Dhi + + vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword + vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword + vpandq $reduc_mask,$Dlo,$Dlo + + vpaddq $T0,$Dhi,$Dhi + + vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword + + vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-) + + vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word + vpandq $reduc_mask,$Dlo,$Dlo + + vpermq \$0b10010011,$T0,$T0 + + vpaddq $T0,$Dlo,$Dlo + + vpermq \$0b10010011,$Dlo,${T0}{%k1}{z} + + vpaddq $T0,$Dlo,$Dlo + vpsllq \$2,$T0,$T0 + + vpaddq $T0,$Dlo,$Dlo + + dec %rax # len-=16 + jnz .Loop_vpmadd52 + + vmovdqu64 $Dlo,0($ctx){%k7} # store hash value + + test $len,$len + jnz .Lblocks_vpmadd52_4x + +.Lno_data_vpmadd52: + ret +.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52 +___ +} +{ +######################################################################## +# As implied by its name 4x subroutine processes 4 blocks in parallel +# (but handles even 4*n+2 blocks lengths). It takes up to 4th key power +# and is handled in 256-bit %ymm registers. + +my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); +my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); +my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); + +$code.=<<___; +.type poly1305_blocks_vpmadd52_4x,\@function,4 +.align 32 +poly1305_blocks_vpmadd52_4x: + shr \$4,$len + jz .Lno_data_vpmadd52_4x # too short + + shl \$40,$padbit + mov 64($ctx),%r8 # peek on power of the key + +.Lblocks_vpmadd52_4x: + vpbroadcastq $padbit,$PAD + + vmovdqa64 .Lx_mask44(%rip),$mask44 + mov \$5,%eax + vmovdqa64 .Lx_mask42(%rip),$mask42 + kmovw %eax,%k1 # used in 2x path + + test %r8,%r8 # is power value impossible? + js .Linit_vpmadd52 # if it is, then init R[4] + + vmovq 0($ctx),%x#$H0 # load current hash value + vmovq 8($ctx),%x#$H1 + vmovq 16($ctx),%x#$H2 + + test \$3,$len # is length 4*n+2? + jnz .Lblocks_vpmadd52_2x_do + +.Lblocks_vpmadd52_4x_do: + vpbroadcastq 64($ctx),$R0 # load 4th power of the key + vpbroadcastq 96($ctx),$R1 + vpbroadcastq 128($ctx),$R2 + vpbroadcastq 160($ctx),$S1 + +.Lblocks_vpmadd52_4x_key_loaded: + vpsllq \$2,$R2,$S2 # S2 = R2*5*4 + vpaddq $R2,$S2,$S2 + vpsllq \$2,$S2,$S2 + + test \$7,$len # is len 8*n? 
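+	# [editor's summary] overall length dispatch in this routine:
+	# 4*n+2 lengths were already routed to the 2x handling above,
+	# multiples of 8 blocks drop to the 8x path, and everything else
+	# stays on this 4x path.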
+ jz .Lblocks_vpmadd52_8x + + vmovdqu64 16*0($inp),$T2 # load data + vmovdqu64 16*2($inp),$T3 + lea 16*4($inp),$inp + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + + # at this point 64-bit lanes are ordered as 3-1-2-0 + + vpsrlq \$24,$T3,$T2 # splat the data + vporq $PAD,$T2,$T2 + vpaddq $T2,$H2,$H2 # accumulate input + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + sub \$4,$len + jz .Ltail_vpmadd52_4x + jmp .Loop_vpmadd52_4x + ud2 + +.align 32 +.Linit_vpmadd52: + vmovq 24($ctx),%x#$S1 # load key + vmovq 56($ctx),%x#$H2 + vmovq 32($ctx),%x#$S2 + vmovq 40($ctx),%x#$R0 + vmovq 48($ctx),%x#$R1 + + vmovdqa $R0,$H0 + vmovdqa $R1,$H1 + vmovdqa $H2,$R2 + + mov \$2,%eax + +.Lmul_init_vpmadd52: + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$R0,$D2hi + + vpmadd52luq $H0,$R0,$D0lo + vpmadd52huq $H0,$R0,$D0hi + vpmadd52luq $H0,$R1,$D1lo + vpmadd52huq $H0,$R1,$D1hi + vpmadd52luq $H0,$R2,$D2lo + vpmadd52huq $H0,$R2,$D2hi + + vpmadd52luq $H1,$S2,$D0lo + vpmadd52huq $H1,$S2,$D0hi + vpmadd52luq $H1,$R0,$D1lo + vpmadd52huq $H1,$R0,$D1hi + vpmadd52luq $H1,$R1,$D2lo + vpmadd52huq $H1,$R1,$D2hi + + ################################################################ + # partial reduction + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + + dec %eax + jz .Ldone_init_vpmadd52 + + vpunpcklqdq $R1,$H1,$R1 # 1,2 + vpbroadcastq %x#$H1,%x#$H1 # 2,2 + vpunpcklqdq $R2,$H2,$R2 + vpbroadcastq %x#$H2,%x#$H2 + vpunpcklqdq $R0,$H0,$R0 + vpbroadcastq %x#$H0,%x#$H0 + + vpsllq \$2,$R1,$S1 # S1 = R1*5*4 + vpsllq \$2,$R2,$S2 # S2 = R2*5*4 + vpaddq $R1,$S1,$S1 + vpaddq $R2,$S2,$S2 + vpsllq \$2,$S1,$S1 + vpsllq \$2,$S2,$S2 + + jmp .Lmul_init_vpmadd52 + ud2 + +.align 32 +.Ldone_init_vpmadd52: + vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4 + vinserti128 \$1,%x#$R2,$H2,$R2 + vinserti128 \$1,%x#$R0,$H0,$R0 + + vpermq \$0b11011000,$R1,$R1 # 1,3,2,4 + vpermq \$0b11011000,$R2,$R2 + vpermq \$0b11011000,$R0,$R0 + + vpsllq \$2,$R1,$S1 # S1 = R1*5*4 + vpaddq $R1,$S1,$S1 + vpsllq \$2,$S1,$S1 + + vmovq 0($ctx),%x#$H0 # load current hash value + vmovq 8($ctx),%x#$H1 + vmovq 16($ctx),%x#$H2 + + test \$3,$len # is length 4*n+2? 
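+	# [editor's note] a 4*n+2 length leaves a two-block tail that is
+	# finished with the 2nd and 1st key powers (see the "load 2nd and
+	# 1st key powers" path below) instead of a broadcast 4th power.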
+ jnz .Ldone_init_vpmadd52_2x + + vmovdqu64 $R0,64($ctx) # save key powers + vpbroadcastq %x#$R0,$R0 # broadcast 4th power + vmovdqu64 $R1,96($ctx) + vpbroadcastq %x#$R1,$R1 + vmovdqu64 $R2,128($ctx) + vpbroadcastq %x#$R2,$R2 + vmovdqu64 $S1,160($ctx) + vpbroadcastq %x#$S1,$S1 + + jmp .Lblocks_vpmadd52_4x_key_loaded + ud2 + +.align 32 +.Ldone_init_vpmadd52_2x: + vmovdqu64 $R0,64($ctx) # save key powers + vpsrldq \$8,$R0,$R0 # 0-1-0-2 + vmovdqu64 $R1,96($ctx) + vpsrldq \$8,$R1,$R1 + vmovdqu64 $R2,128($ctx) + vpsrldq \$8,$R2,$R2 + vmovdqu64 $S1,160($ctx) + vpsrldq \$8,$S1,$S1 + jmp .Lblocks_vpmadd52_2x_key_loaded + ud2 + +.align 32 +.Lblocks_vpmadd52_2x_do: + vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers + vmovdqu64 160+8($ctx),${S1}{%k1}{z} + vmovdqu64 64+8($ctx),${R0}{%k1}{z} + vmovdqu64 96+8($ctx),${R1}{%k1}{z} + +.Lblocks_vpmadd52_2x_key_loaded: + vmovdqu64 16*0($inp),$T2 # load data + vpxorq $T3,$T3,$T3 + lea 16*2($inp),$inp + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + + # at this point 64-bit lanes are ordered as x-1-x-0 + + vpsrlq \$24,$T3,$T2 # splat the data + vporq $PAD,$T2,$T2 + vpaddq $T2,$H2,$H2 # accumulate input + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + jmp .Ltail_vpmadd52_2x + ud2 + +.align 32 +.Loop_vpmadd52_4x: + #vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $T0,$H0,$H0 + vpaddq $T1,$H1,$H1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$R0,$D2hi + + vmovdqu64 16*0($inp),$T2 # load data + vmovdqu64 16*2($inp),$T3 + lea 16*4($inp),$inp + vpmadd52luq $H0,$R0,$D0lo + vpmadd52huq $H0,$R0,$D0hi + vpmadd52luq $H0,$R1,$D1lo + vpmadd52huq $H0,$R1,$D1hi + vpmadd52luq $H0,$R2,$D2lo + vpmadd52huq $H0,$R2,$D2hi + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + vpmadd52luq $H1,$S2,$D0lo + vpmadd52huq $H1,$S2,$D0hi + vpmadd52luq $H1,$R0,$D1lo + vpmadd52huq $H1,$R0,$D1hi + vpmadd52luq $H1,$R1,$D2lo + vpmadd52huq $H1,$R1,$D2hi + + ################################################################ + # partial reduction (interleaved with data splat) + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpsrlq \$24,$T3,$T2 + vporq $PAD,$T2,$T2 + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + + sub \$4,$len # len-=64 + jnz .Loop_vpmadd52_4x + +.Ltail_vpmadd52_4x: + vmovdqu64 128($ctx),$R2 # load all key powers + vmovdqu64 160($ctx),$S1 + vmovdqu64 64($ctx),$R0 + vmovdqu64 96($ctx),$R1 + +.Ltail_vpmadd52_2x: + vpsllq \$2,$R2,$S2 # S2 = R2*5*4 + vpaddq $R2,$S2,$S2 + vpsllq \$2,$S2,$S2 + + #vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $T0,$H0,$H0 + vpaddq $T1,$H1,$H1 + + vpxorq $D0lo,$D0lo,$D0lo + 
vpmadd52luq $H2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$R0,$D2hi + + vpmadd52luq $H0,$R0,$D0lo + vpmadd52huq $H0,$R0,$D0hi + vpmadd52luq $H0,$R1,$D1lo + vpmadd52huq $H0,$R1,$D1hi + vpmadd52luq $H0,$R2,$D2lo + vpmadd52huq $H0,$R2,$D2hi + + vpmadd52luq $H1,$S2,$D0lo + vpmadd52huq $H1,$S2,$D0hi + vpmadd52luq $H1,$R0,$D1lo + vpmadd52huq $H1,$R0,$D1hi + vpmadd52luq $H1,$R1,$D2lo + vpmadd52huq $H1,$R1,$D2hi + + ################################################################ + # horizontal addition + + mov \$1,%eax + kmovw %eax,%k1 + vpsrldq \$8,$D0lo,$T0 + vpsrldq \$8,$D0hi,$H0 + vpsrldq \$8,$D1lo,$T1 + vpsrldq \$8,$D1hi,$H1 + vpaddq $T0,$D0lo,$D0lo + vpaddq $H0,$D0hi,$D0hi + vpsrldq \$8,$D2lo,$T2 + vpsrldq \$8,$D2hi,$H2 + vpaddq $T1,$D1lo,$D1lo + vpaddq $H1,$D1hi,$D1hi + vpermq \$0x2,$D0lo,$T0 + vpermq \$0x2,$D0hi,$H0 + vpaddq $T2,$D2lo,$D2lo + vpaddq $H2,$D2hi,$D2hi + + vpermq \$0x2,$D1lo,$T1 + vpermq \$0x2,$D1hi,$H1 + vpaddq $T0,$D0lo,${D0lo}{%k1}{z} + vpaddq $H0,$D0hi,${D0hi}{%k1}{z} + vpermq \$0x2,$D2lo,$T2 + vpermq \$0x2,$D2hi,$H2 + vpaddq $T1,$D1lo,${D1lo}{%k1}{z} + vpaddq $H1,$D1hi,${D1hi}{%k1}{z} + vpaddq $T2,$D2lo,${D2lo}{%k1}{z} + vpaddq $H2,$D2hi,${D2hi}{%k1}{z} + + ################################################################ + # partial reduction + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + # at this point $len is + # either 4*n+2 or 0... + sub \$2,$len # len-=32 + ja .Lblocks_vpmadd52_4x_do + + vmovq %x#$H0,0($ctx) + vmovq %x#$H1,8($ctx) + vmovq %x#$H2,16($ctx) + vzeroall + +.Lno_data_vpmadd52_4x: + ret +.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x +___ +} +{ +######################################################################## +# As implied by its name 8x subroutine processes 8 blocks in parallel... +# This is intermediate version, as it's used only in cases when input +# length is either 8*n, 8*n+1 or 8*n+2... + +my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); +my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); +my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); +my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10)); + +$code.=<<___; +.type poly1305_blocks_vpmadd52_8x,\@function,4 +.align 32 +poly1305_blocks_vpmadd52_8x: + shr \$4,$len + jz .Lno_data_vpmadd52_8x # too short + + shl \$40,$padbit + mov 64($ctx),%r8 # peek on power of the key + + vmovdqa64 .Lx_mask44(%rip),$mask44 + vmovdqa64 .Lx_mask42(%rip),$mask42 + + test %r8,%r8 # is power value impossible? 
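+	# [editor's note] poly1305_init_base2_44 writes -1, an "impossible
+	# value", to 64($ctx); a set sign bit here therefore means the key
+	# powers have not been computed yet.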
+ js .Linit_vpmadd52 # if it is, then init R[4] + + vmovq 0($ctx),%x#$H0 # load current hash value + vmovq 8($ctx),%x#$H1 + vmovq 16($ctx),%x#$H2 + +.Lblocks_vpmadd52_8x: + ################################################################ + # fist we calculate more key powers + + vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers + vmovdqu64 160($ctx),$S1 + vmovdqu64 64($ctx),$R0 + vmovdqu64 96($ctx),$R1 + + vpsllq \$2,$R2,$S2 # S2 = R2*5*4 + vpaddq $R2,$S2,$S2 + vpsllq \$2,$S2,$S2 + + vpbroadcastq %x#$R2,$RR2 # broadcast 4th power + vpbroadcastq %x#$R0,$RR0 + vpbroadcastq %x#$R1,$RR1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $RR2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $RR2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $RR2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $RR2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $RR2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $RR2,$R0,$D2hi + + vpmadd52luq $RR0,$R0,$D0lo + vpmadd52huq $RR0,$R0,$D0hi + vpmadd52luq $RR0,$R1,$D1lo + vpmadd52huq $RR0,$R1,$D1hi + vpmadd52luq $RR0,$R2,$D2lo + vpmadd52huq $RR0,$R2,$D2hi + + vpmadd52luq $RR1,$S2,$D0lo + vpmadd52huq $RR1,$S2,$D0hi + vpmadd52luq $RR1,$R0,$D1lo + vpmadd52huq $RR1,$R0,$D1hi + vpmadd52luq $RR1,$R1,$D2lo + vpmadd52huq $RR1,$R1,$D2hi + + ################################################################ + # partial reduction + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$RR0 + vpaddq $tmp,$D0hi,$D0hi + + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$RR1 + vpaddq $tmp,$D1hi,$D1hi + + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$RR2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $D2hi,$RR0,$RR0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$RR0,$RR0 + + vpsrlq \$44,$RR0,$tmp # additional step + vpandq $mask44,$RR0,$RR0 + + vpaddq $tmp,$RR1,$RR1 + + ################################################################ + # At this point Rx holds 1324 powers, RRx - 5768, and the goal + # is 15263748, which reflects how data is loaded... 
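+	#
+	# [editor's reading, illustrative] the vpunpck{l,h}qdq pairs below
+	# interleave the two power tables as (1,5)(2,6)(3,7)(4,8), and the
+	# vshufi64x2 after the %ymm -> %zmm switch concatenates the halves,
+	# turning 1324 and 5768 into the 15263748 order called for above.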
+ + vpunpcklqdq $R2,$RR2,$T2 # 3748 + vpunpckhqdq $R2,$RR2,$R2 # 1526 + vpunpcklqdq $R0,$RR0,$T0 + vpunpckhqdq $R0,$RR0,$R0 + vpunpcklqdq $R1,$RR1,$T1 + vpunpckhqdq $R1,$RR1,$R1 +___ +######## switch to %zmm +map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); +map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); +map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); +map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2); + +$code.=<<___; + vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748 + vshufi64x2 \$0x44,$R0,$T0,$RR0 + vshufi64x2 \$0x44,$R1,$T1,$RR1 + + vmovdqu64 16*0($inp),$T2 # load data + vmovdqu64 16*4($inp),$T3 + lea 16*8($inp),$inp + + vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4 + vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4 + vpaddq $RR2,$SS2,$SS2 + vpaddq $RR1,$SS1,$SS1 + vpsllq \$2,$SS2,$SS2 + vpsllq \$2,$SS1,$SS1 + + vpbroadcastq $padbit,$PAD + vpbroadcastq %x#$mask44,$mask44 + vpbroadcastq %x#$mask42,$mask42 + + vpbroadcastq %x#$SS1,$S1 # broadcast 8th power + vpbroadcastq %x#$SS2,$S2 + vpbroadcastq %x#$RR0,$R0 + vpbroadcastq %x#$RR1,$R1 + vpbroadcastq %x#$RR2,$R2 + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + + # at this point 64-bit lanes are ordered as 73625140 + + vpsrlq \$24,$T3,$T2 # splat the data + vporq $PAD,$T2,$T2 + vpaddq $T2,$H2,$H2 # accumulate input + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + sub \$8,$len + jz .Ltail_vpmadd52_8x + jmp .Loop_vpmadd52_8x + +.align 32 +.Loop_vpmadd52_8x: + #vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $T0,$H0,$H0 + vpaddq $T1,$H1,$H1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$R0,$D2hi + + vmovdqu64 16*0($inp),$T2 # load data + vmovdqu64 16*4($inp),$T3 + lea 16*8($inp),$inp + vpmadd52luq $H0,$R0,$D0lo + vpmadd52huq $H0,$R0,$D0hi + vpmadd52luq $H0,$R1,$D1lo + vpmadd52huq $H0,$R1,$D1hi + vpmadd52luq $H0,$R2,$D2lo + vpmadd52huq $H0,$R2,$D2hi + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + vpmadd52luq $H1,$S2,$D0lo + vpmadd52huq $H1,$S2,$D0hi + vpmadd52luq $H1,$R0,$D1lo + vpmadd52huq $H1,$R0,$D1hi + vpmadd52luq $H1,$R1,$D2lo + vpmadd52huq $H1,$R1,$D2hi + + ################################################################ + # partial reduction (interleaved with data splat) + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpsrlq \$24,$T3,$T2 + vporq $PAD,$T2,$T2 + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + + sub \$8,$len # len-=128 + jnz .Loop_vpmadd52_8x + +.Ltail_vpmadd52_8x: + #vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $T0,$H0,$H0 + vpaddq $T1,$H1,$H1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$SS1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$SS1,$D0hi + vpxorq 
$D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$SS2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$SS2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$RR0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$RR0,$D2hi + + vpmadd52luq $H0,$RR0,$D0lo + vpmadd52huq $H0,$RR0,$D0hi + vpmadd52luq $H0,$RR1,$D1lo + vpmadd52huq $H0,$RR1,$D1hi + vpmadd52luq $H0,$RR2,$D2lo + vpmadd52huq $H0,$RR2,$D2hi + + vpmadd52luq $H1,$SS2,$D0lo + vpmadd52huq $H1,$SS2,$D0hi + vpmadd52luq $H1,$RR0,$D1lo + vpmadd52huq $H1,$RR0,$D1hi + vpmadd52luq $H1,$RR1,$D2lo + vpmadd52huq $H1,$RR1,$D2hi + + ################################################################ + # horizontal addition + + mov \$1,%eax + kmovw %eax,%k1 + vpsrldq \$8,$D0lo,$T0 + vpsrldq \$8,$D0hi,$H0 + vpsrldq \$8,$D1lo,$T1 + vpsrldq \$8,$D1hi,$H1 + vpaddq $T0,$D0lo,$D0lo + vpaddq $H0,$D0hi,$D0hi + vpsrldq \$8,$D2lo,$T2 + vpsrldq \$8,$D2hi,$H2 + vpaddq $T1,$D1lo,$D1lo + vpaddq $H1,$D1hi,$D1hi + vpermq \$0x2,$D0lo,$T0 + vpermq \$0x2,$D0hi,$H0 + vpaddq $T2,$D2lo,$D2lo + vpaddq $H2,$D2hi,$D2hi + + vpermq \$0x2,$D1lo,$T1 + vpermq \$0x2,$D1hi,$H1 + vpaddq $T0,$D0lo,$D0lo + vpaddq $H0,$D0hi,$D0hi + vpermq \$0x2,$D2lo,$T2 + vpermq \$0x2,$D2hi,$H2 + vpaddq $T1,$D1lo,$D1lo + vpaddq $H1,$D1hi,$D1hi + vextracti64x4 \$1,$D0lo,%y#$T0 + vextracti64x4 \$1,$D0hi,%y#$H0 + vpaddq $T2,$D2lo,$D2lo + vpaddq $H2,$D2hi,$D2hi + + vextracti64x4 \$1,$D1lo,%y#$T1 + vextracti64x4 \$1,$D1hi,%y#$H1 + vextracti64x4 \$1,$D2lo,%y#$T2 + vextracti64x4 \$1,$D2hi,%y#$H2 +___ +######## switch back to %ymm +map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); +map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); +map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); + +$code.=<<___; + vpaddq $T0,$D0lo,${D0lo}{%k1}{z} + vpaddq $H0,$D0hi,${D0hi}{%k1}{z} + vpaddq $T1,$D1lo,${D1lo}{%k1}{z} + vpaddq $H1,$D1hi,${D1hi}{%k1}{z} + vpaddq $T2,$D2lo,${D2lo}{%k1}{z} + vpaddq $H2,$D2hi,${D2hi}{%k1}{z} + + ################################################################ + # partial reduction + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + + ################################################################ + + vmovq %x#$H0,0($ctx) + vmovq %x#$H1,8($ctx) + vmovq %x#$H2,16($ctx) + vzeroall + +.Lno_data_vpmadd52_8x: + ret +.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x +___ +} +$code.=<<___; +.type poly1305_emit_base2_44,\@function,3 +.align 32 +poly1305_emit_base2_44: + mov 0($ctx),%r8 # load hash value + mov 8($ctx),%r9 + mov 16($ctx),%r10 + + mov %r9,%rax + shr \$20,%r9 + shl \$44,%rax + mov %r10,%rcx + shr \$40,%r10 + shl \$24,%rcx + + add %rax,%r8 + adc %rcx,%r9 + adc \$0,%r10 + + mov %r8,%rax + add \$5,%r8 # compare to modulus + mov %r9,%rcx + adc \$0,%r9 + adc \$0,%r10 + shr \$2,%r10 # did 130-bit value overflow? 
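+	# [editor's sketch, assuming u64 limbs h0/h1/h2 as loaded above]
+	# the comparison against the modulus in C:
+	#
+	#	g0 = h0 + 5;  c = (g0 < 5);	/* g = h + 5       */
+	#	g1 = h1 + c;  c = (g1 < c);
+	#	g2 = h2 + c;
+	#	if (g2 >> 2)			/* h >= 2^130 - 5? */
+	#		h0 = g0, h1 = g1;	/* h -= 2^130 - 5  */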
+ cmovnz %r8,%rax + cmovnz %r9,%rcx + + add 0($nonce),%rax # accumulate nonce + adc 8($nonce),%rcx + mov %rax,0($mac) # write result + mov %rcx,8($mac) + + ret +.size poly1305_emit_base2_44,.-poly1305_emit_base2_44 +___ +} } } +$code.=<<___; +.align 64 +.Lconst: +.Lmask24: +.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 +.L129: +.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0 +.Lmask26: +.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 +.Lpermd_avx2: +.long 2,2,2,3,2,0,2,1 +.Lpermd_avx512: +.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 + +.L2_44_inp_permd: +.long 0,1,1,2,2,3,7,7 +.L2_44_inp_shift: +.quad 0,12,24,64 +.L2_44_mask: +.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff +.L2_44_shift_rgt: +.quad 44,44,42,64 +.L2_44_shift_lft: +.quad 8,8,10,64 + +.align 64 +.Lx_mask44: +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff +.Lx_mask42: +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff +___ +} +$code.=<<___; +.asciz "Poly1305 for x86_64, CRYPTOGAMS by " +.align 16 +___ + +{ # chacha20-poly1305 helpers +my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order + ("%rdi","%rsi","%rdx","%rcx"); # Unix order +$code.=<<___; +.globl xor128_encrypt_n_pad +.type xor128_encrypt_n_pad,\@abi-omnipotent +.align 16 +xor128_encrypt_n_pad: + sub $otp,$inp + sub $otp,$out + mov $len,%r10 # put len aside + shr \$4,$len # len / 16 + jz .Ltail_enc + nop +.Loop_enc_xmm: + movdqu ($inp,$otp),%xmm0 + pxor ($otp),%xmm0 + movdqu %xmm0,($out,$otp) + movdqa %xmm0,($otp) + lea 16($otp),$otp + dec $len + jnz .Loop_enc_xmm + + and \$15,%r10 # len % 16 + jz .Ldone_enc + +.Ltail_enc: + mov \$16,$len + sub %r10,$len + xor %eax,%eax +.Loop_enc_byte: + mov ($inp,$otp),%al + xor ($otp),%al + mov %al,($out,$otp) + mov %al,($otp) + lea 1($otp),$otp + dec %r10 + jnz .Loop_enc_byte + + xor %eax,%eax +.Loop_enc_pad: + mov %al,($otp) + lea 1($otp),$otp + dec $len + jnz .Loop_enc_pad + +.Ldone_enc: + mov $otp,%rax + ret +.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad + +.globl xor128_decrypt_n_pad +.type xor128_decrypt_n_pad,\@abi-omnipotent +.align 16 +xor128_decrypt_n_pad: + sub $otp,$inp + sub $otp,$out + mov $len,%r10 # put len aside + shr \$4,$len # len / 16 + jz .Ltail_dec + nop +.Loop_dec_xmm: + movdqu ($inp,$otp),%xmm0 + movdqa ($otp),%xmm1 + pxor %xmm0,%xmm1 + movdqu %xmm1,($out,$otp) + movdqa %xmm0,($otp) + lea 16($otp),$otp + dec $len + jnz .Loop_dec_xmm + + pxor %xmm1,%xmm1 + and \$15,%r10 # len % 16 + jz .Ldone_dec + +.Ltail_dec: + mov \$16,$len + sub %r10,$len + xor %eax,%eax + xor %r11,%r11 +.Loop_dec_byte: + mov ($inp,$otp),%r11b + mov ($otp),%al + xor %r11b,%al + mov %al,($out,$otp) + mov %r11b,($otp) + lea 1($otp),$otp + dec %r10 + jnz .Loop_dec_byte + + xor %eax,%eax +.Loop_dec_pad: + mov %al,($otp) + lea 1($otp),$otp + dec $len + jnz .Loop_dec_pad + +.Ldone_dec: + mov $otp,%rax + ret +.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad +___ +} + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 
248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# prologue label
+	cmp	%r10,%rbx		# context->Rip<.Lprologue
+	jb	.Lcommon_seh_tail
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
+	jae	.Lcommon_seh_tail
+
+	lea	48(%rax),%rax
+
+	mov	-8(%rax),%rbx
+	mov	-16(%rax),%rbp
+	mov	-24(%rax),%r12
+	mov	-32(%rax),%r13
+	mov	-40(%rax),%r14
+	mov	-48(%rax),%r15
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R13
+	mov	%r14,232($context)	# restore context->R14
+	mov	%r15,240($context)	# restore context->R15
+
+	jmp	.Lcommon_seh_tail
+.size	se_handler,.-se_handler
+
+.type	avx_handler,\@abi-omnipotent
+.align	16
+avx_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# prologue label
+	cmp	%r10,%rbx		# context->Rip<prologue label
+	jb	.Lcommon_seh_tail
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lcommon_seh_tail
+
+	mov	208($context),%rax	# pull context->R11
+
+	lea	0x50(%rax),%rsi
+	lea	0xf8(%rax),%rax
+	lea	512($context),%rdi	# &context.Xmm6
+	mov	\$20,%ecx
+	.long	0xa548f3fc		# cld; rep movsq
+
+.Lcommon_seh_tail:
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$154,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	avx_handler,.-avx_handler
+
+.section	.pdata
+.align	4
+	.rva	.LSEH_begin_poly1305_init
+	.rva	.LSEH_end_poly1305_init
+	.rva	.LSEH_info_poly1305_init
+
+	.rva	.LSEH_begin_poly1305_blocks
+	.rva	.LSEH_end_poly1305_blocks
+	.rva	.LSEH_info_poly1305_blocks
+
+	.rva	.LSEH_begin_poly1305_emit
+	.rva	.LSEH_end_poly1305_emit
+	.rva	.LSEH_info_poly1305_emit
+___
+$code.=<<___ if ($avx);
+	.rva	.LSEH_begin_poly1305_blocks_avx
+	.rva	.Lbase2_64_avx
+	.rva	.LSEH_info_poly1305_blocks_avx_1
+
+	.rva	.Lbase2_64_avx
+	.rva	.Leven_avx
+	.rva	.LSEH_info_poly1305_blocks_avx_2
+
+	.rva	.Leven_avx
+	.rva	.LSEH_end_poly1305_blocks_avx
+	.rva	.LSEH_info_poly1305_blocks_avx_3
+
+	.rva	.LSEH_begin_poly1305_emit_avx
+	.rva	.LSEH_end_poly1305_emit_avx
+	.rva	.LSEH_info_poly1305_emit_avx
+___
+$code.=<<___ if ($avx>1);
+	.rva	.LSEH_begin_poly1305_blocks_avx2
+	.rva	.Lbase2_64_avx2
+	.rva	.LSEH_info_poly1305_blocks_avx2_1
+
+	.rva	.Lbase2_64_avx2
+	.rva	.Leven_avx2
+	.rva	.LSEH_info_poly1305_blocks_avx2_2
+
+	.rva	.Leven_avx2
+	.rva	.LSEH_end_poly1305_blocks_avx2
+	.rva	.LSEH_info_poly1305_blocks_avx2_3
+___
+$code.=<<___ if ($avx>2);
+	.rva	.LSEH_begin_poly1305_blocks_avx512
+	.rva	.LSEH_end_poly1305_blocks_avx512
+	.rva	.LSEH_info_poly1305_blocks_avx512
+___
+$code.=<<___;
+.section	.xdata
+.align	8
+.LSEH_info_poly1305_init:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init
+
+.LSEH_info_poly1305_blocks:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lblocks_body,.Lblocks_epilogue
+
+.LSEH_info_poly1305_emit:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit
+___
+$code.=<<___ if ($avx);
+.LSEH_info_poly1305_blocks_avx_1:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lblocks_avx_body,.Lblocks_avx_epilogue		# HandlerData[]
+
+.LSEH_info_poly1305_blocks_avx_2:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lbase2_64_avx_body,.Lbase2_64_avx_epilogue	# HandlerData[]
+
+.LSEH_info_poly1305_blocks_avx_3:
+	.byte	9,0,0,0
+	.rva	avx_handler
+	.rva	.Ldo_avx_body,.Ldo_avx_epilogue			# HandlerData[]
+
+.LSEH_info_poly1305_emit_avx:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
+___
+$code.=<<___ if ($avx>1);
+.LSEH_info_poly1305_blocks_avx2_1:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lblocks_avx2_body,.Lblocks_avx2_epilogue	# HandlerData[]
+
+.LSEH_info_poly1305_blocks_avx2_2:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue	# HandlerData[]
+
+.LSEH_info_poly1305_blocks_avx2_3:
+	.byte	9,0,0,0
+	.rva	avx_handler
+	.rva	.Ldo_avx2_body,.Ldo_avx2_epilogue		# HandlerData[]
+___
+$code.=<<___ if ($avx>2);
+.LSEH_info_poly1305_blocks_avx512:
+	.byte	9,0,0,0
+	.rva	avx_handler
+	.rva	.Ldo_avx512_body,.Ldo_avx512_epilogue		# HandlerData[]
+___
+}
+
+foreach (split('\n',$code)) {
+	s/\`([^\`]*)\`/eval($1)/ge;
+	s/%r([a-z]+)#d/%e$1/g;
+	s/%r([0-9]+)#d/%r$1d/g;
+	s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;
+
+	print $_,"\n";
+}
+close STDOUT;
-- cgit v1.2.3

From d7d7b853566254648df59f7ea27ea05952a6cfa8 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld"
Date: Sun, 5 Jan 2020 22:40:48 -0500
Subject: crypto: x86/poly1305 - wire up faster implementations for kernel

These x86_64 vectorized implementations support AVX, AVX-2, and AVX512F.
The AVX-512F implementation is disabled on Skylake, due to throttling,
but it is quite fast on >= Cannonlake.

On the left are cycle counts on a Core i7 6700HQ using the AVX-2
codepath, comparing this implementation ("new") to the implementation
in the current crypto api ("old"). On the right are benchmarks on a
Xeon Gold 5120 using the AVX-512 codepath. The new implementation is
faster on all benchmarks.

        AVX-2                      AVX-512
      ---------                  -----------
    size    old     new        size    old     new
    ----    ----    ----       ----    ----    ----
    0       70      68         0       74      70
    16      92      90         16      96      92
    32      134     104        32      136     106
    48      172     120        48      184     124
    64      218     136        64      218     138
    80      254     158        80      260     160
    96      298     174        96      300     176
    112     342     192        112     342     194
    128     388     212        128     384     212
    144     428     228        144     420     226
    160     466     246        160     464     248
    176     510     264        176     504     264
    192     550     282        192     544     282
    208     594     302        208     582     300
    224     628     316        224     624     318
    240     676     334        240     662     338
    256     716     354        256     708     358
    272     764     374        272     748     372
    288     802     352        288     788     358
    304     420     366        304     422     370
    320     428     360        320     432     364
    336     484     378        336     486     380
    352     426     384        352     434     390
    368     478     400        368     480     408
    384     488     394        384     490     398
    400     542     408        400     542     412
    416     486     416        416     492     426
    432     534     430        432     538     436
    448     544     422        448     546     432
    464     600     438        464     600     448
    480     540     448        480     548     456
    496     594     464        496     594     476
    512     602     456        512     606     470
    528     656     476        528     656     480
    544     600     480        544     606     498
    560     650     494        560     652     512
    576     664     490        576     662     508
    592     714     508        592     716     522
    608     656     514        608     664     538
    624     708     532        624     710     552
    640     716     524        640     720     516
    656     770     536        656     772     526
    672     716     548        672     722     544
    688     770     562        688     768     556
    704     774     552        704     778     556
    720     826     568        720     832     568
    736     768     574        736     780     584
    752     822     592        752     826     600
    768     830     584        768     836     560
    784     884     602        784     888     572
    800     828     610        800     838     588
    816     884     628        816     884     604
    832     888     618        832     894     598
    848     942     632        848     946     612
    864     884     644        864     896     628
    880     936     660        880     942     644
    896     948     652        896     952     608
    912     1000    664        912     1004    616
    928     942     676        928     954     634
    944     994     690        944     1000    646
    960     1002    680        960     1008    646
    976     1054    694        976     1062    658
    992     1002    706        992     1012    674
    1008    1052    720        1008    1058    690

This commit wires in the prior implementation from Andy, and makes the
following changes to be suitable for kernel land.

 - Some cosmetic and structural changes, like renaming labels to
   .Lname, constants, and other Linux conventions, as well as making
   the code easy for us to maintain moving forward.

 - CPU feature checking is done in C by the glue code.

 - We avoid jumping into the middle of functions, to appease objtool,
   and instead parameterize shared code.

 - We maintain frame pointers so that stack traces make sense.

 - We remove the dependency on the perl xlate code, which transforms
   the output into things that assemblers we don't care about use.

Importantly, none of our changes affect the arithmetic or core code,
but just involve the differing environment of kernel space.

Signed-off-by: Jason A.
Donenfeld Signed-off-by: Samuel Neves Co-developed-by: Samuel Neves Signed-off-by: Herbert Xu --- arch/x86/crypto/.gitignore | 1 + arch/x86/crypto/Makefile | 11 +- arch/x86/crypto/poly1305-avx2-x86_64.S | 390 --------------- arch/x86/crypto/poly1305-sse2-x86_64.S | 590 ---------------------- arch/x86/crypto/poly1305-x86_64-cryptogams.pl | 682 +++++++++++++++----------- arch/x86/crypto/poly1305_glue.c | 473 +++++++----------- lib/crypto/Kconfig | 2 +- 7 files changed, 572 insertions(+), 1577 deletions(-) create mode 100644 arch/x86/crypto/.gitignore delete mode 100644 arch/x86/crypto/poly1305-avx2-x86_64.S delete mode 100644 arch/x86/crypto/poly1305-sse2-x86_64.S (limited to 'arch/x86') diff --git a/arch/x86/crypto/.gitignore b/arch/x86/crypto/.gitignore new file mode 100644 index 000000000000..c406ea6571fa --- /dev/null +++ b/arch/x86/crypto/.gitignore @@ -0,0 +1 @@ +poly1305-x86_64.S diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 958440eae27e..b69e00bf20b8 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -73,6 +73,10 @@ aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o blake2s-x86_64-y := blake2s-core.o blake2s-glue.o +poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o +ifneq ($(CONFIG_CRYPTO_POLY1305_X86_64),) +targets += poly1305-x86_64-cryptogams.S +endif ifeq ($(avx_supported),yes) camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \ @@ -101,10 +105,8 @@ aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o -poly1305-x86_64-y := poly1305-sse2-x86_64.o poly1305_glue.o ifeq ($(avx2_supported),yes) sha1-ssse3-y += sha1_avx2_x86_64_asm.o -poly1305-x86_64-y += poly1305-avx2-x86_64.o endif ifeq ($(sha1_ni_supported),yes) sha1-ssse3-y += sha1_ni_asm.o @@ -118,3 +120,8 @@ sha256-ssse3-y += sha256_ni_asm.o endif sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o + +quiet_cmd_perlasm = PERLASM $@ + cmd_perlasm = $(PERL) $< > $@ +$(obj)/%.S: $(src)/%.pl FORCE + $(call if_changed,perlasm) diff --git a/arch/x86/crypto/poly1305-avx2-x86_64.S b/arch/x86/crypto/poly1305-avx2-x86_64.S deleted file mode 100644 index 8f56989ea599..000000000000 --- a/arch/x86/crypto/poly1305-avx2-x86_64.S +++ /dev/null @@ -1,390 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Poly1305 authenticator algorithm, RFC7539, x64 AVX2 functions - * - * Copyright (C) 2015 Martin Willi - */ - -#include - -.section .rodata.cst32.ANMASK, "aM", @progbits, 32 -.align 32 -ANMASK: .octa 0x0000000003ffffff0000000003ffffff - .octa 0x0000000003ffffff0000000003ffffff - -.section .rodata.cst32.ORMASK, "aM", @progbits, 32 -.align 32 -ORMASK: .octa 0x00000000010000000000000001000000 - .octa 0x00000000010000000000000001000000 - -.text - -#define h0 0x00(%rdi) -#define h1 0x04(%rdi) -#define h2 0x08(%rdi) -#define h3 0x0c(%rdi) -#define h4 0x10(%rdi) -#define r0 0x00(%rdx) -#define r1 0x04(%rdx) -#define r2 0x08(%rdx) -#define r3 0x0c(%rdx) -#define r4 0x10(%rdx) -#define u0 0x00(%r8) -#define u1 0x04(%r8) -#define u2 0x08(%r8) -#define u3 0x0c(%r8) -#define u4 0x10(%r8) -#define w0 0x18(%r8) -#define w1 0x1c(%r8) -#define w2 0x20(%r8) -#define w3 0x24(%r8) -#define w4 
0x28(%r8) -#define y0 0x30(%r8) -#define y1 0x34(%r8) -#define y2 0x38(%r8) -#define y3 0x3c(%r8) -#define y4 0x40(%r8) -#define m %rsi -#define hc0 %ymm0 -#define hc1 %ymm1 -#define hc2 %ymm2 -#define hc3 %ymm3 -#define hc4 %ymm4 -#define hc0x %xmm0 -#define hc1x %xmm1 -#define hc2x %xmm2 -#define hc3x %xmm3 -#define hc4x %xmm4 -#define t1 %ymm5 -#define t2 %ymm6 -#define t1x %xmm5 -#define t2x %xmm6 -#define ruwy0 %ymm7 -#define ruwy1 %ymm8 -#define ruwy2 %ymm9 -#define ruwy3 %ymm10 -#define ruwy4 %ymm11 -#define ruwy0x %xmm7 -#define ruwy1x %xmm8 -#define ruwy2x %xmm9 -#define ruwy3x %xmm10 -#define ruwy4x %xmm11 -#define svxz1 %ymm12 -#define svxz2 %ymm13 -#define svxz3 %ymm14 -#define svxz4 %ymm15 -#define d0 %r9 -#define d1 %r10 -#define d2 %r11 -#define d3 %r12 -#define d4 %r13 - -SYM_FUNC_START(poly1305_4block_avx2) - # %rdi: Accumulator h[5] - # %rsi: 64 byte input block m - # %rdx: Poly1305 key r[5] - # %rcx: Quadblock count - # %r8: Poly1305 derived key r^2 u[5], r^3 w[5], r^4 y[5], - - # This four-block variant uses loop unrolled block processing. It - # requires 4 Poly1305 keys: r, r^2, r^3 and r^4: - # h = (h + m) * r => h = (h + m1) * r^4 + m2 * r^3 + m3 * r^2 + m4 * r - - vzeroupper - push %rbx - push %r12 - push %r13 - - # combine r0,u0,w0,y0 - vmovd y0,ruwy0x - vmovd w0,t1x - vpunpcklqdq t1,ruwy0,ruwy0 - vmovd u0,t1x - vmovd r0,t2x - vpunpcklqdq t2,t1,t1 - vperm2i128 $0x20,t1,ruwy0,ruwy0 - - # combine r1,u1,w1,y1 and s1=r1*5,v1=u1*5,x1=w1*5,z1=y1*5 - vmovd y1,ruwy1x - vmovd w1,t1x - vpunpcklqdq t1,ruwy1,ruwy1 - vmovd u1,t1x - vmovd r1,t2x - vpunpcklqdq t2,t1,t1 - vperm2i128 $0x20,t1,ruwy1,ruwy1 - vpslld $2,ruwy1,svxz1 - vpaddd ruwy1,svxz1,svxz1 - - # combine r2,u2,w2,y2 and s2=r2*5,v2=u2*5,x2=w2*5,z2=y2*5 - vmovd y2,ruwy2x - vmovd w2,t1x - vpunpcklqdq t1,ruwy2,ruwy2 - vmovd u2,t1x - vmovd r2,t2x - vpunpcklqdq t2,t1,t1 - vperm2i128 $0x20,t1,ruwy2,ruwy2 - vpslld $2,ruwy2,svxz2 - vpaddd ruwy2,svxz2,svxz2 - - # combine r3,u3,w3,y3 and s3=r3*5,v3=u3*5,x3=w3*5,z3=y3*5 - vmovd y3,ruwy3x - vmovd w3,t1x - vpunpcklqdq t1,ruwy3,ruwy3 - vmovd u3,t1x - vmovd r3,t2x - vpunpcklqdq t2,t1,t1 - vperm2i128 $0x20,t1,ruwy3,ruwy3 - vpslld $2,ruwy3,svxz3 - vpaddd ruwy3,svxz3,svxz3 - - # combine r4,u4,w4,y4 and s4=r4*5,v4=u4*5,x4=w4*5,z4=y4*5 - vmovd y4,ruwy4x - vmovd w4,t1x - vpunpcklqdq t1,ruwy4,ruwy4 - vmovd u4,t1x - vmovd r4,t2x - vpunpcklqdq t2,t1,t1 - vperm2i128 $0x20,t1,ruwy4,ruwy4 - vpslld $2,ruwy4,svxz4 - vpaddd ruwy4,svxz4,svxz4 - -.Ldoblock4: - # hc0 = [m[48-51] & 0x3ffffff, m[32-35] & 0x3ffffff, - # m[16-19] & 0x3ffffff, m[ 0- 3] & 0x3ffffff + h0] - vmovd 0x00(m),hc0x - vmovd 0x10(m),t1x - vpunpcklqdq t1,hc0,hc0 - vmovd 0x20(m),t1x - vmovd 0x30(m),t2x - vpunpcklqdq t2,t1,t1 - vperm2i128 $0x20,t1,hc0,hc0 - vpand ANMASK(%rip),hc0,hc0 - vmovd h0,t1x - vpaddd t1,hc0,hc0 - # hc1 = [(m[51-54] >> 2) & 0x3ffffff, (m[35-38] >> 2) & 0x3ffffff, - # (m[19-22] >> 2) & 0x3ffffff, (m[ 3- 6] >> 2) & 0x3ffffff + h1] - vmovd 0x03(m),hc1x - vmovd 0x13(m),t1x - vpunpcklqdq t1,hc1,hc1 - vmovd 0x23(m),t1x - vmovd 0x33(m),t2x - vpunpcklqdq t2,t1,t1 - vperm2i128 $0x20,t1,hc1,hc1 - vpsrld $2,hc1,hc1 - vpand ANMASK(%rip),hc1,hc1 - vmovd h1,t1x - vpaddd t1,hc1,hc1 - # hc2 = [(m[54-57] >> 4) & 0x3ffffff, (m[38-41] >> 4) & 0x3ffffff, - # (m[22-25] >> 4) & 0x3ffffff, (m[ 6- 9] >> 4) & 0x3ffffff + h2] - vmovd 0x06(m),hc2x - vmovd 0x16(m),t1x - vpunpcklqdq t1,hc2,hc2 - vmovd 0x26(m),t1x - vmovd 0x36(m),t2x - vpunpcklqdq t2,t1,t1 - vperm2i128 $0x20,t1,hc2,hc2 - vpsrld $4,hc2,hc2 - vpand ANMASK(%rip),hc2,hc2 - vmovd 
h2,t1x - vpaddd t1,hc2,hc2 - # hc3 = [(m[57-60] >> 6) & 0x3ffffff, (m[41-44] >> 6) & 0x3ffffff, - # (m[25-28] >> 6) & 0x3ffffff, (m[ 9-12] >> 6) & 0x3ffffff + h3] - vmovd 0x09(m),hc3x - vmovd 0x19(m),t1x - vpunpcklqdq t1,hc3,hc3 - vmovd 0x29(m),t1x - vmovd 0x39(m),t2x - vpunpcklqdq t2,t1,t1 - vperm2i128 $0x20,t1,hc3,hc3 - vpsrld $6,hc3,hc3 - vpand ANMASK(%rip),hc3,hc3 - vmovd h3,t1x - vpaddd t1,hc3,hc3 - # hc4 = [(m[60-63] >> 8) | (1<<24), (m[44-47] >> 8) | (1<<24), - # (m[28-31] >> 8) | (1<<24), (m[12-15] >> 8) | (1<<24) + h4] - vmovd 0x0c(m),hc4x - vmovd 0x1c(m),t1x - vpunpcklqdq t1,hc4,hc4 - vmovd 0x2c(m),t1x - vmovd 0x3c(m),t2x - vpunpcklqdq t2,t1,t1 - vperm2i128 $0x20,t1,hc4,hc4 - vpsrld $8,hc4,hc4 - vpor ORMASK(%rip),hc4,hc4 - vmovd h4,t1x - vpaddd t1,hc4,hc4 - - # t1 = [ hc0[3] * r0, hc0[2] * u0, hc0[1] * w0, hc0[0] * y0 ] - vpmuludq hc0,ruwy0,t1 - # t1 += [ hc1[3] * s4, hc1[2] * v4, hc1[1] * x4, hc1[0] * z4 ] - vpmuludq hc1,svxz4,t2 - vpaddq t2,t1,t1 - # t1 += [ hc2[3] * s3, hc2[2] * v3, hc2[1] * x3, hc2[0] * z3 ] - vpmuludq hc2,svxz3,t2 - vpaddq t2,t1,t1 - # t1 += [ hc3[3] * s2, hc3[2] * v2, hc3[1] * x2, hc3[0] * z2 ] - vpmuludq hc3,svxz2,t2 - vpaddq t2,t1,t1 - # t1 += [ hc4[3] * s1, hc4[2] * v1, hc4[1] * x1, hc4[0] * z1 ] - vpmuludq hc4,svxz1,t2 - vpaddq t2,t1,t1 - # d0 = t1[0] + t1[1] + t[2] + t[3] - vpermq $0xee,t1,t2 - vpaddq t2,t1,t1 - vpsrldq $8,t1,t2 - vpaddq t2,t1,t1 - vmovq t1x,d0 - - # t1 = [ hc0[3] * r1, hc0[2] * u1,hc0[1] * w1, hc0[0] * y1 ] - vpmuludq hc0,ruwy1,t1 - # t1 += [ hc1[3] * r0, hc1[2] * u0, hc1[1] * w0, hc1[0] * y0 ] - vpmuludq hc1,ruwy0,t2 - vpaddq t2,t1,t1 - # t1 += [ hc2[3] * s4, hc2[2] * v4, hc2[1] * x4, hc2[0] * z4 ] - vpmuludq hc2,svxz4,t2 - vpaddq t2,t1,t1 - # t1 += [ hc3[3] * s3, hc3[2] * v3, hc3[1] * x3, hc3[0] * z3 ] - vpmuludq hc3,svxz3,t2 - vpaddq t2,t1,t1 - # t1 += [ hc4[3] * s2, hc4[2] * v2, hc4[1] * x2, hc4[0] * z2 ] - vpmuludq hc4,svxz2,t2 - vpaddq t2,t1,t1 - # d1 = t1[0] + t1[1] + t1[3] + t1[4] - vpermq $0xee,t1,t2 - vpaddq t2,t1,t1 - vpsrldq $8,t1,t2 - vpaddq t2,t1,t1 - vmovq t1x,d1 - - # t1 = [ hc0[3] * r2, hc0[2] * u2, hc0[1] * w2, hc0[0] * y2 ] - vpmuludq hc0,ruwy2,t1 - # t1 += [ hc1[3] * r1, hc1[2] * u1, hc1[1] * w1, hc1[0] * y1 ] - vpmuludq hc1,ruwy1,t2 - vpaddq t2,t1,t1 - # t1 += [ hc2[3] * r0, hc2[2] * u0, hc2[1] * w0, hc2[0] * y0 ] - vpmuludq hc2,ruwy0,t2 - vpaddq t2,t1,t1 - # t1 += [ hc3[3] * s4, hc3[2] * v4, hc3[1] * x4, hc3[0] * z4 ] - vpmuludq hc3,svxz4,t2 - vpaddq t2,t1,t1 - # t1 += [ hc4[3] * s3, hc4[2] * v3, hc4[1] * x3, hc4[0] * z3 ] - vpmuludq hc4,svxz3,t2 - vpaddq t2,t1,t1 - # d2 = t1[0] + t1[1] + t1[2] + t1[3] - vpermq $0xee,t1,t2 - vpaddq t2,t1,t1 - vpsrldq $8,t1,t2 - vpaddq t2,t1,t1 - vmovq t1x,d2 - - # t1 = [ hc0[3] * r3, hc0[2] * u3, hc0[1] * w3, hc0[0] * y3 ] - vpmuludq hc0,ruwy3,t1 - # t1 += [ hc1[3] * r2, hc1[2] * u2, hc1[1] * w2, hc1[0] * y2 ] - vpmuludq hc1,ruwy2,t2 - vpaddq t2,t1,t1 - # t1 += [ hc2[3] * r1, hc2[2] * u1, hc2[1] * w1, hc2[0] * y1 ] - vpmuludq hc2,ruwy1,t2 - vpaddq t2,t1,t1 - # t1 += [ hc3[3] * r0, hc3[2] * u0, hc3[1] * w0, hc3[0] * y0 ] - vpmuludq hc3,ruwy0,t2 - vpaddq t2,t1,t1 - # t1 += [ hc4[3] * s4, hc4[2] * v4, hc4[1] * x4, hc4[0] * z4 ] - vpmuludq hc4,svxz4,t2 - vpaddq t2,t1,t1 - # d3 = t1[0] + t1[1] + t1[2] + t1[3] - vpermq $0xee,t1,t2 - vpaddq t2,t1,t1 - vpsrldq $8,t1,t2 - vpaddq t2,t1,t1 - vmovq t1x,d3 - - # t1 = [ hc0[3] * r4, hc0[2] * u4, hc0[1] * w4, hc0[0] * y4 ] - vpmuludq hc0,ruwy4,t1 - # t1 += [ hc1[3] * r3, hc1[2] * u3, hc1[1] * w3, hc1[0] * y3 ] - vpmuludq hc1,ruwy3,t2 - vpaddq 
t2,t1,t1 - # t1 += [ hc2[3] * r2, hc2[2] * u2, hc2[1] * w2, hc2[0] * y2 ] - vpmuludq hc2,ruwy2,t2 - vpaddq t2,t1,t1 - # t1 += [ hc3[3] * r1, hc3[2] * u1, hc3[1] * w1, hc3[0] * y1 ] - vpmuludq hc3,ruwy1,t2 - vpaddq t2,t1,t1 - # t1 += [ hc4[3] * r0, hc4[2] * u0, hc4[1] * w0, hc4[0] * y0 ] - vpmuludq hc4,ruwy0,t2 - vpaddq t2,t1,t1 - # d4 = t1[0] + t1[1] + t1[2] + t1[3] - vpermq $0xee,t1,t2 - vpaddq t2,t1,t1 - vpsrldq $8,t1,t2 - vpaddq t2,t1,t1 - vmovq t1x,d4 - - # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 -> - # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small - # amount. Careful: we must not assume the carry bits 'd0 >> 26', - # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit - # integers. It's true in a single-block implementation, but not here. - - # d1 += d0 >> 26 - mov d0,%rax - shr $26,%rax - add %rax,d1 - # h0 = d0 & 0x3ffffff - mov d0,%rbx - and $0x3ffffff,%ebx - - # d2 += d1 >> 26 - mov d1,%rax - shr $26,%rax - add %rax,d2 - # h1 = d1 & 0x3ffffff - mov d1,%rax - and $0x3ffffff,%eax - mov %eax,h1 - - # d3 += d2 >> 26 - mov d2,%rax - shr $26,%rax - add %rax,d3 - # h2 = d2 & 0x3ffffff - mov d2,%rax - and $0x3ffffff,%eax - mov %eax,h2 - - # d4 += d3 >> 26 - mov d3,%rax - shr $26,%rax - add %rax,d4 - # h3 = d3 & 0x3ffffff - mov d3,%rax - and $0x3ffffff,%eax - mov %eax,h3 - - # h0 += (d4 >> 26) * 5 - mov d4,%rax - shr $26,%rax - lea (%rax,%rax,4),%rax - add %rax,%rbx - # h4 = d4 & 0x3ffffff - mov d4,%rax - and $0x3ffffff,%eax - mov %eax,h4 - - # h1 += h0 >> 26 - mov %rbx,%rax - shr $26,%rax - add %eax,h1 - # h0 = h0 & 0x3ffffff - andl $0x3ffffff,%ebx - mov %ebx,h0 - - add $0x40,m - dec %rcx - jnz .Ldoblock4 - - vzeroupper - pop %r13 - pop %r12 - pop %rbx - ret -SYM_FUNC_END(poly1305_4block_avx2) diff --git a/arch/x86/crypto/poly1305-sse2-x86_64.S b/arch/x86/crypto/poly1305-sse2-x86_64.S deleted file mode 100644 index d8ea29b96640..000000000000 --- a/arch/x86/crypto/poly1305-sse2-x86_64.S +++ /dev/null @@ -1,590 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions - * - * Copyright (C) 2015 Martin Willi - */ - -#include - -.section .rodata.cst16.ANMASK, "aM", @progbits, 16 -.align 16 -ANMASK: .octa 0x0000000003ffffff0000000003ffffff - -.section .rodata.cst16.ORMASK, "aM", @progbits, 16 -.align 16 -ORMASK: .octa 0x00000000010000000000000001000000 - -.text - -#define h0 0x00(%rdi) -#define h1 0x04(%rdi) -#define h2 0x08(%rdi) -#define h3 0x0c(%rdi) -#define h4 0x10(%rdi) -#define r0 0x00(%rdx) -#define r1 0x04(%rdx) -#define r2 0x08(%rdx) -#define r3 0x0c(%rdx) -#define r4 0x10(%rdx) -#define s1 0x00(%rsp) -#define s2 0x04(%rsp) -#define s3 0x08(%rsp) -#define s4 0x0c(%rsp) -#define m %rsi -#define h01 %xmm0 -#define h23 %xmm1 -#define h44 %xmm2 -#define t1 %xmm3 -#define t2 %xmm4 -#define t3 %xmm5 -#define t4 %xmm6 -#define mask %xmm7 -#define d0 %r8 -#define d1 %r9 -#define d2 %r10 -#define d3 %r11 -#define d4 %r12 - -SYM_FUNC_START(poly1305_block_sse2) - # %rdi: Accumulator h[5] - # %rsi: 16 byte input block m - # %rdx: Poly1305 key r[5] - # %rcx: Block count - - # This single block variant tries to improve performance by doing two - # multiplications in parallel using SSE instructions. There is quite - # some quardword packing involved, hence the speedup is marginal. 
- - push %rbx - push %r12 - sub $0x10,%rsp - - # s1..s4 = r1..r4 * 5 - mov r1,%eax - lea (%eax,%eax,4),%eax - mov %eax,s1 - mov r2,%eax - lea (%eax,%eax,4),%eax - mov %eax,s2 - mov r3,%eax - lea (%eax,%eax,4),%eax - mov %eax,s3 - mov r4,%eax - lea (%eax,%eax,4),%eax - mov %eax,s4 - - movdqa ANMASK(%rip),mask - -.Ldoblock: - # h01 = [0, h1, 0, h0] - # h23 = [0, h3, 0, h2] - # h44 = [0, h4, 0, h4] - movd h0,h01 - movd h1,t1 - movd h2,h23 - movd h3,t2 - movd h4,h44 - punpcklqdq t1,h01 - punpcklqdq t2,h23 - punpcklqdq h44,h44 - - # h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ] - movd 0x00(m),t1 - movd 0x03(m),t2 - psrld $2,t2 - punpcklqdq t2,t1 - pand mask,t1 - paddd t1,h01 - # h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ] - movd 0x06(m),t1 - movd 0x09(m),t2 - psrld $4,t1 - psrld $6,t2 - punpcklqdq t2,t1 - pand mask,t1 - paddd t1,h23 - # h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ] - mov 0x0c(m),%eax - shr $8,%eax - or $0x01000000,%eax - movd %eax,t1 - pshufd $0xc4,t1,t1 - paddd t1,h44 - - # t1[0] = h0 * r0 + h2 * s3 - # t1[1] = h1 * s4 + h3 * s2 - movd r0,t1 - movd s4,t2 - punpcklqdq t2,t1 - pmuludq h01,t1 - movd s3,t2 - movd s2,t3 - punpcklqdq t3,t2 - pmuludq h23,t2 - paddq t2,t1 - # t2[0] = h0 * r1 + h2 * s4 - # t2[1] = h1 * r0 + h3 * s3 - movd r1,t2 - movd r0,t3 - punpcklqdq t3,t2 - pmuludq h01,t2 - movd s4,t3 - movd s3,t4 - punpcklqdq t4,t3 - pmuludq h23,t3 - paddq t3,t2 - # t3[0] = h4 * s1 - # t3[1] = h4 * s2 - movd s1,t3 - movd s2,t4 - punpcklqdq t4,t3 - pmuludq h44,t3 - # d0 = t1[0] + t1[1] + t3[0] - # d1 = t2[0] + t2[1] + t3[1] - movdqa t1,t4 - punpcklqdq t2,t4 - punpckhqdq t2,t1 - paddq t4,t1 - paddq t3,t1 - movq t1,d0 - psrldq $8,t1 - movq t1,d1 - - # t1[0] = h0 * r2 + h2 * r0 - # t1[1] = h1 * r1 + h3 * s4 - movd r2,t1 - movd r1,t2 - punpcklqdq t2,t1 - pmuludq h01,t1 - movd r0,t2 - movd s4,t3 - punpcklqdq t3,t2 - pmuludq h23,t2 - paddq t2,t1 - # t2[0] = h0 * r3 + h2 * r1 - # t2[1] = h1 * r2 + h3 * r0 - movd r3,t2 - movd r2,t3 - punpcklqdq t3,t2 - pmuludq h01,t2 - movd r1,t3 - movd r0,t4 - punpcklqdq t4,t3 - pmuludq h23,t3 - paddq t3,t2 - # t3[0] = h4 * s3 - # t3[1] = h4 * s4 - movd s3,t3 - movd s4,t4 - punpcklqdq t4,t3 - pmuludq h44,t3 - # d2 = t1[0] + t1[1] + t3[0] - # d3 = t2[0] + t2[1] + t3[1] - movdqa t1,t4 - punpcklqdq t2,t4 - punpckhqdq t2,t1 - paddq t4,t1 - paddq t3,t1 - movq t1,d2 - psrldq $8,t1 - movq t1,d3 - - # t1[0] = h0 * r4 + h2 * r2 - # t1[1] = h1 * r3 + h3 * r1 - movd r4,t1 - movd r3,t2 - punpcklqdq t2,t1 - pmuludq h01,t1 - movd r2,t2 - movd r1,t3 - punpcklqdq t3,t2 - pmuludq h23,t2 - paddq t2,t1 - # t3[0] = h4 * r0 - movd r0,t3 - pmuludq h44,t3 - # d4 = t1[0] + t1[1] + t3[0] - movdqa t1,t4 - psrldq $8,t4 - paddq t4,t1 - paddq t3,t1 - movq t1,d4 - - # d1 += d0 >> 26 - mov d0,%rax - shr $26,%rax - add %rax,d1 - # h0 = d0 & 0x3ffffff - mov d0,%rbx - and $0x3ffffff,%ebx - - # d2 += d1 >> 26 - mov d1,%rax - shr $26,%rax - add %rax,d2 - # h1 = d1 & 0x3ffffff - mov d1,%rax - and $0x3ffffff,%eax - mov %eax,h1 - - # d3 += d2 >> 26 - mov d2,%rax - shr $26,%rax - add %rax,d3 - # h2 = d2 & 0x3ffffff - mov d2,%rax - and $0x3ffffff,%eax - mov %eax,h2 - - # d4 += d3 >> 26 - mov d3,%rax - shr $26,%rax - add %rax,d4 - # h3 = d3 & 0x3ffffff - mov d3,%rax - and $0x3ffffff,%eax - mov %eax,h3 - - # h0 += (d4 >> 26) * 5 - mov d4,%rax - shr $26,%rax - lea (%rax,%rax,4),%rax - add %rax,%rbx - # h4 = d4 & 0x3ffffff - mov d4,%rax - and $0x3ffffff,%eax - mov %eax,h4 - - # h1 += h0 >> 26 - mov %rbx,%rax - shr $26,%rax - add %eax,h1 - # h0 = 
h0 & 0x3ffffff - andl $0x3ffffff,%ebx - mov %ebx,h0 - - add $0x10,m - dec %rcx - jnz .Ldoblock - - # Zeroing of key material - mov %rcx,0x00(%rsp) - mov %rcx,0x08(%rsp) - - add $0x10,%rsp - pop %r12 - pop %rbx - ret -SYM_FUNC_END(poly1305_block_sse2) - - -#define u0 0x00(%r8) -#define u1 0x04(%r8) -#define u2 0x08(%r8) -#define u3 0x0c(%r8) -#define u4 0x10(%r8) -#define hc0 %xmm0 -#define hc1 %xmm1 -#define hc2 %xmm2 -#define hc3 %xmm5 -#define hc4 %xmm6 -#define ru0 %xmm7 -#define ru1 %xmm8 -#define ru2 %xmm9 -#define ru3 %xmm10 -#define ru4 %xmm11 -#define sv1 %xmm12 -#define sv2 %xmm13 -#define sv3 %xmm14 -#define sv4 %xmm15 -#undef d0 -#define d0 %r13 - -SYM_FUNC_START(poly1305_2block_sse2) - # %rdi: Accumulator h[5] - # %rsi: 16 byte input block m - # %rdx: Poly1305 key r[5] - # %rcx: Doubleblock count - # %r8: Poly1305 derived key r^2 u[5] - - # This two-block variant further improves performance by using loop - # unrolled block processing. This is more straight forward and does - # less byte shuffling, but requires a second Poly1305 key r^2: - # h = (h + m) * r => h = (h + m1) * r^2 + m2 * r - - push %rbx - push %r12 - push %r13 - - # combine r0,u0 - movd u0,ru0 - movd r0,t1 - punpcklqdq t1,ru0 - - # combine r1,u1 and s1=r1*5,v1=u1*5 - movd u1,ru1 - movd r1,t1 - punpcklqdq t1,ru1 - movdqa ru1,sv1 - pslld $2,sv1 - paddd ru1,sv1 - - # combine r2,u2 and s2=r2*5,v2=u2*5 - movd u2,ru2 - movd r2,t1 - punpcklqdq t1,ru2 - movdqa ru2,sv2 - pslld $2,sv2 - paddd ru2,sv2 - - # combine r3,u3 and s3=r3*5,v3=u3*5 - movd u3,ru3 - movd r3,t1 - punpcklqdq t1,ru3 - movdqa ru3,sv3 - pslld $2,sv3 - paddd ru3,sv3 - - # combine r4,u4 and s4=r4*5,v4=u4*5 - movd u4,ru4 - movd r4,t1 - punpcklqdq t1,ru4 - movdqa ru4,sv4 - pslld $2,sv4 - paddd ru4,sv4 - -.Ldoblock2: - # hc0 = [ m[16-19] & 0x3ffffff, h0 + m[0-3] & 0x3ffffff ] - movd 0x00(m),hc0 - movd 0x10(m),t1 - punpcklqdq t1,hc0 - pand ANMASK(%rip),hc0 - movd h0,t1 - paddd t1,hc0 - # hc1 = [ (m[19-22] >> 2) & 0x3ffffff, h1 + (m[3-6] >> 2) & 0x3ffffff ] - movd 0x03(m),hc1 - movd 0x13(m),t1 - punpcklqdq t1,hc1 - psrld $2,hc1 - pand ANMASK(%rip),hc1 - movd h1,t1 - paddd t1,hc1 - # hc2 = [ (m[22-25] >> 4) & 0x3ffffff, h2 + (m[6-9] >> 4) & 0x3ffffff ] - movd 0x06(m),hc2 - movd 0x16(m),t1 - punpcklqdq t1,hc2 - psrld $4,hc2 - pand ANMASK(%rip),hc2 - movd h2,t1 - paddd t1,hc2 - # hc3 = [ (m[25-28] >> 6) & 0x3ffffff, h3 + (m[9-12] >> 6) & 0x3ffffff ] - movd 0x09(m),hc3 - movd 0x19(m),t1 - punpcklqdq t1,hc3 - psrld $6,hc3 - pand ANMASK(%rip),hc3 - movd h3,t1 - paddd t1,hc3 - # hc4 = [ (m[28-31] >> 8) | (1<<24), h4 + (m[12-15] >> 8) | (1<<24) ] - movd 0x0c(m),hc4 - movd 0x1c(m),t1 - punpcklqdq t1,hc4 - psrld $8,hc4 - por ORMASK(%rip),hc4 - movd h4,t1 - paddd t1,hc4 - - # t1 = [ hc0[1] * r0, hc0[0] * u0 ] - movdqa ru0,t1 - pmuludq hc0,t1 - # t1 += [ hc1[1] * s4, hc1[0] * v4 ] - movdqa sv4,t2 - pmuludq hc1,t2 - paddq t2,t1 - # t1 += [ hc2[1] * s3, hc2[0] * v3 ] - movdqa sv3,t2 - pmuludq hc2,t2 - paddq t2,t1 - # t1 += [ hc3[1] * s2, hc3[0] * v2 ] - movdqa sv2,t2 - pmuludq hc3,t2 - paddq t2,t1 - # t1 += [ hc4[1] * s1, hc4[0] * v1 ] - movdqa sv1,t2 - pmuludq hc4,t2 - paddq t2,t1 - # d0 = t1[0] + t1[1] - movdqa t1,t2 - psrldq $8,t2 - paddq t2,t1 - movq t1,d0 - - # t1 = [ hc0[1] * r1, hc0[0] * u1 ] - movdqa ru1,t1 - pmuludq hc0,t1 - # t1 += [ hc1[1] * r0, hc1[0] * u0 ] - movdqa ru0,t2 - pmuludq hc1,t2 - paddq t2,t1 - # t1 += [ hc2[1] * s4, hc2[0] * v4 ] - movdqa sv4,t2 - pmuludq hc2,t2 - paddq t2,t1 - # t1 += [ hc3[1] * s3, hc3[0] * v3 ] - movdqa sv3,t2 - pmuludq hc3,t2 - 
paddq t2,t1 - # t1 += [ hc4[1] * s2, hc4[0] * v2 ] - movdqa sv2,t2 - pmuludq hc4,t2 - paddq t2,t1 - # d1 = t1[0] + t1[1] - movdqa t1,t2 - psrldq $8,t2 - paddq t2,t1 - movq t1,d1 - - # t1 = [ hc0[1] * r2, hc0[0] * u2 ] - movdqa ru2,t1 - pmuludq hc0,t1 - # t1 += [ hc1[1] * r1, hc1[0] * u1 ] - movdqa ru1,t2 - pmuludq hc1,t2 - paddq t2,t1 - # t1 += [ hc2[1] * r0, hc2[0] * u0 ] - movdqa ru0,t2 - pmuludq hc2,t2 - paddq t2,t1 - # t1 += [ hc3[1] * s4, hc3[0] * v4 ] - movdqa sv4,t2 - pmuludq hc3,t2 - paddq t2,t1 - # t1 += [ hc4[1] * s3, hc4[0] * v3 ] - movdqa sv3,t2 - pmuludq hc4,t2 - paddq t2,t1 - # d2 = t1[0] + t1[1] - movdqa t1,t2 - psrldq $8,t2 - paddq t2,t1 - movq t1,d2 - - # t1 = [ hc0[1] * r3, hc0[0] * u3 ] - movdqa ru3,t1 - pmuludq hc0,t1 - # t1 += [ hc1[1] * r2, hc1[0] * u2 ] - movdqa ru2,t2 - pmuludq hc1,t2 - paddq t2,t1 - # t1 += [ hc2[1] * r1, hc2[0] * u1 ] - movdqa ru1,t2 - pmuludq hc2,t2 - paddq t2,t1 - # t1 += [ hc3[1] * r0, hc3[0] * u0 ] - movdqa ru0,t2 - pmuludq hc3,t2 - paddq t2,t1 - # t1 += [ hc4[1] * s4, hc4[0] * v4 ] - movdqa sv4,t2 - pmuludq hc4,t2 - paddq t2,t1 - # d3 = t1[0] + t1[1] - movdqa t1,t2 - psrldq $8,t2 - paddq t2,t1 - movq t1,d3 - - # t1 = [ hc0[1] * r4, hc0[0] * u4 ] - movdqa ru4,t1 - pmuludq hc0,t1 - # t1 += [ hc1[1] * r3, hc1[0] * u3 ] - movdqa ru3,t2 - pmuludq hc1,t2 - paddq t2,t1 - # t1 += [ hc2[1] * r2, hc2[0] * u2 ] - movdqa ru2,t2 - pmuludq hc2,t2 - paddq t2,t1 - # t1 += [ hc3[1] * r1, hc3[0] * u1 ] - movdqa ru1,t2 - pmuludq hc3,t2 - paddq t2,t1 - # t1 += [ hc4[1] * r0, hc4[0] * u0 ] - movdqa ru0,t2 - pmuludq hc4,t2 - paddq t2,t1 - # d4 = t1[0] + t1[1] - movdqa t1,t2 - psrldq $8,t2 - paddq t2,t1 - movq t1,d4 - - # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 -> - # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small - # amount. Careful: we must not assume the carry bits 'd0 >> 26', - # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit - # integers. It's true in a single-block implementation, but not here. - - # d1 += d0 >> 26 - mov d0,%rax - shr $26,%rax - add %rax,d1 - # h0 = d0 & 0x3ffffff - mov d0,%rbx - and $0x3ffffff,%ebx - - # d2 += d1 >> 26 - mov d1,%rax - shr $26,%rax - add %rax,d2 - # h1 = d1 & 0x3ffffff - mov d1,%rax - and $0x3ffffff,%eax - mov %eax,h1 - - # d3 += d2 >> 26 - mov d2,%rax - shr $26,%rax - add %rax,d3 - # h2 = d2 & 0x3ffffff - mov d2,%rax - and $0x3ffffff,%eax - mov %eax,h2 - - # d4 += d3 >> 26 - mov d3,%rax - shr $26,%rax - add %rax,d4 - # h3 = d3 & 0x3ffffff - mov d3,%rax - and $0x3ffffff,%eax - mov %eax,h3 - - # h0 += (d4 >> 26) * 5 - mov d4,%rax - shr $26,%rax - lea (%rax,%rax,4),%rax - add %rax,%rbx - # h4 = d4 & 0x3ffffff - mov d4,%rax - and $0x3ffffff,%eax - mov %eax,h4 - - # h1 += h0 >> 26 - mov %rbx,%rax - shr $26,%rax - add %eax,h1 - # h0 = h0 & 0x3ffffff - andl $0x3ffffff,%ebx - mov %ebx,h0 - - add $0x20,m - dec %rcx - jnz .Ldoblock2 - - pop %r13 - pop %r12 - pop %rbx - ret -SYM_FUNC_END(poly1305_2block_sse2) diff --git a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl index 342ad7f18aa7..7a6b5380a46f 100644 --- a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl +++ b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl @@ -1,11 +1,14 @@ -#! /usr/bin/env perl -# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved. +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause # -# Licensed under the OpenSSL license (the "License"). You may not use -# this file except in compliance with the License. 
You can obtain a copy -# in the file LICENSE in the source distribution or at -# https://www.openssl.org/source/license.html - +# Copyright (C) 2017-2018 Samuel Neves . All Rights Reserved. +# Copyright (C) 2017-2019 Jason A. Donenfeld . All Rights Reserved. +# Copyright (C) 2006-2017 CRYPTOGAMS by . All Rights Reserved. +# +# This code is taken from the OpenSSL project but the author, Andy Polyakov, +# has relicensed it under the licenses specified in the SPDX header above. +# The original headers, including the original license headers, are +# included below for completeness. # # ==================================================================== # Written by Andy Polyakov for the OpenSSL @@ -32,7 +35,7 @@ # Skylake-X system performance. Since we are likely to suppress # AVX512F capability flag [at least on Skylake-X], conversion serves # as kind of "investment protection". Note that next *lake processor, -# Cannolake, has AVX512IFMA code path to execute... +# Cannonlake, has AVX512IFMA code path to execute... # # Numbers are cycles per processed byte with poly1305_blocks alone, # measured with rdtsc at fixed clock frequency. @@ -68,39 +71,114 @@ $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` - =~ /GNU assembler version ([2-9]\.[0-9]+)/) { - $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26); +$kernel=0; $kernel=1 if (!$flavour && !$output); + +if (!$kernel) { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or + die "can't locate x86_64-xlate.pl"; + + open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; + *STDOUT=*OUT; + + if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25); + } + + if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { + $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12); + $avx += 1 if ($1==2.11 && $2>=8); + } + + if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=11); + } + + if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { + $avx = ($2>=3.0) + ($2>3.0); + } +} else { + $avx = 4; # The kernel uses ifdefs for this. 
} -if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && - `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { - $avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12); - $avx += 2 if ($1==2.11 && $2>=8); +sub declare_function() { + my ($name, $align, $nargs) = @_; + if($kernel) { + $code .= ".align $align\n"; + $code .= "SYM_FUNC_START($name)\n"; + $code .= ".L$name:\n"; + } else { + $code .= ".globl $name\n"; + $code .= ".type $name,\@function,$nargs\n"; + $code .= ".align $align\n"; + $code .= "$name:\n"; + } } -if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && - `ml64 2>&1` =~ /Version ([0-9]+)\./) { - $avx = ($1>=10) + ($1>=12); +sub end_function() { + my ($name) = @_; + if($kernel) { + $code .= "SYM_FUNC_END($name)\n"; + } else { + $code .= ".size $name,.-$name\n"; + } } -if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { - $avx = ($2>=3.0) + ($2>3.0); -} +$code.=<<___ if $kernel; +#include +___ + +if ($avx) { +$code.=<<___ if $kernel; +.section .rodata +___ +$code.=<<___; +.align 64 +.Lconst: +.Lmask24: +.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 +.L129: +.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0 +.Lmask26: +.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 +.Lpermd_avx2: +.long 2,2,2,3,2,0,2,1 +.Lpermd_avx512: +.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 + +.L2_44_inp_permd: +.long 0,1,1,2,2,3,7,7 +.L2_44_inp_shift: +.quad 0,12,24,64 +.L2_44_mask: +.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff +.L2_44_shift_rgt: +.quad 44,44,42,64 +.L2_44_shift_lft: +.quad 8,8,10,64 -open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; -*STDOUT=*OUT; +.align 64 +.Lx_mask44: +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff +.Lx_mask42: +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff +___ +} +$code.=<<___ if (!$kernel); +.asciz "Poly1305 for x86_64, CRYPTOGAMS by " +.align 16 +___ my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx"); my ($mac,$nonce)=($inp,$len); # *_emit arguments -my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13)); -my ($h0,$h1,$h2)=("%r14","%rbx","%rbp"); +my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13"); +my ($h0,$h1,$h2)=("%r14","%rbx","%r10"); sub poly1305_iteration { # input: copy of $r1 in %rax, $h0-$h2, $r0-$r1 @@ -155,19 +233,19 @@ ___ $code.=<<___; .text - +___ +$code.=<<___ if (!$kernel); .extern OPENSSL_ia32cap_P -.globl poly1305_init -.hidden poly1305_init -.globl poly1305_blocks -.hidden poly1305_blocks -.globl poly1305_emit -.hidden poly1305_emit - -.type poly1305_init,\@function,3 -.align 32 -poly1305_init: +.globl poly1305_init_x86_64 +.hidden poly1305_init_x86_64 +.globl poly1305_blocks_x86_64 +.hidden poly1305_blocks_x86_64 +.globl poly1305_emit_x86_64 +.hidden poly1305_emit_x86_64 +___ +&declare_function("poly1305_init_x86_64", 32, 3); +$code.=<<___; xor %rax,%rax mov %rax,0($ctx) # initialize hash value mov %rax,8($ctx) @@ -175,11 +253,12 @@ poly1305_init: cmp \$0,$inp je .Lno_key - - lea poly1305_blocks(%rip),%r10 - lea poly1305_emit(%rip),%r11 ___ -$code.=<<___ if ($avx); +$code.=<<___ if (!$kernel); + lea poly1305_blocks_x86_64(%rip),%r10 + lea poly1305_emit_x86_64(%rip),%r11 +___ +$code.=<<___ if (!$kernel && $avx); mov OPENSSL_ia32cap_P+4(%rip),%r9 lea poly1305_blocks_avx(%rip),%rax lea poly1305_emit_avx(%rip),%rcx @@ -187,12 +266,12 @@ $code.=<<___ if 
($avx); cmovc %rax,%r10 cmovc %rcx,%r11 ___ -$code.=<<___ if ($avx>1); +$code.=<<___ if (!$kernel && $avx>1); lea poly1305_blocks_avx2(%rip),%rax bt \$`5+32`,%r9 # AVX2? cmovc %rax,%r10 ___ -$code.=<<___ if ($avx>3); +$code.=<<___ if (!$kernel && $avx>3); mov \$`(1<<31|1<<21|1<<16)`,%rax shr \$32,%r9 and %rax,%r9 @@ -207,11 +286,11 @@ $code.=<<___; mov %rax,24($ctx) mov %rcx,32($ctx) ___ -$code.=<<___ if ($flavour !~ /elf32/); +$code.=<<___ if (!$kernel && $flavour !~ /elf32/); mov %r10,0(%rdx) mov %r11,8(%rdx) ___ -$code.=<<___ if ($flavour =~ /elf32/); +$code.=<<___ if (!$kernel && $flavour =~ /elf32/); mov %r10d,0(%rdx) mov %r11d,4(%rdx) ___ @@ -219,11 +298,11 @@ $code.=<<___; mov \$1,%eax .Lno_key: ret -.size poly1305_init,.-poly1305_init +___ +&end_function("poly1305_init_x86_64"); -.type poly1305_blocks,\@function,4 -.align 32 -poly1305_blocks: +&declare_function("poly1305_blocks_x86_64", 32, 4); +$code.=<<___; .cfi_startproc .Lblocks: shr \$4,$len @@ -231,8 +310,6 @@ poly1305_blocks: push %rbx .cfi_push %rbx - push %rbp -.cfi_push %rbp push %r12 .cfi_push %r12 push %r13 @@ -241,6 +318,8 @@ poly1305_blocks: .cfi_push %r14 push %r15 .cfi_push %r15 + push $ctx +.cfi_push $ctx .Lblocks_body: mov $len,%r15 # reassign $len @@ -265,26 +344,29 @@ poly1305_blocks: lea 16($inp),$inp adc $padbit,$h2 ___ + &poly1305_iteration(); + $code.=<<___; mov $r1,%rax dec %r15 # len-=16 jnz .Loop + mov 0(%rsp),$ctx +.cfi_restore $ctx + mov $h0,0($ctx) # store hash value mov $h1,8($ctx) mov $h2,16($ctx) - mov 0(%rsp),%r15 + mov 8(%rsp),%r15 .cfi_restore %r15 - mov 8(%rsp),%r14 + mov 16(%rsp),%r14 .cfi_restore %r14 - mov 16(%rsp),%r13 + mov 24(%rsp),%r13 .cfi_restore %r13 - mov 24(%rsp),%r12 + mov 32(%rsp),%r12 .cfi_restore %r12 - mov 32(%rsp),%rbp -.cfi_restore %rbp mov 40(%rsp),%rbx .cfi_restore %rbx lea 48(%rsp),%rsp @@ -293,11 +375,11 @@ $code.=<<___; .Lblocks_epilogue: ret .cfi_endproc -.size poly1305_blocks,.-poly1305_blocks +___ +&end_function("poly1305_blocks_x86_64"); -.type poly1305_emit,\@function,3 -.align 32 -poly1305_emit: +&declare_function("poly1305_emit_x86_64", 32, 3); +$code.=<<___; .Lemit: mov 0($ctx),%r8 # load hash value mov 8($ctx),%r9 @@ -318,10 +400,14 @@ poly1305_emit: mov %rcx,8($mac) ret -.size poly1305_emit,.-poly1305_emit ___ +&end_function("poly1305_emit_x86_64"); if ($avx) { +if($kernel) { + $code .= "#ifdef CONFIG_AS_AVX\n"; +} + ######################################################################## # Layout of opaque area is following. 
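[Editor's aside: this patch gives the CRYPTOGAMS "opaque area" a concrete C view, struct poly1305_arch_internal in poly1305_glue.c further below; it is worth keeping in mind while reading the layout comment that follows. The union lets the same 24 bytes of hash state be read either as five base-2^26 limbs plus an is_base2_26 flag (the vector code's view) or as three base-2^64 words (the scalar code's view), while rn[] holds the precomputed powers of r, interleaved for the vector paths:

struct poly1305_arch_internal {
	union {
		struct {
			u32 h[5];	/* hash, five 26-bit limbs */
			u32 is_base2_26;
		};
		u64 hs[3];		/* same bytes, base-2^64 view */
	};
	u64 r[2];			/* key r, base 2^64 */
	u64 pad;
	struct { u32 r2, r1, r4, r3; } rn[9];
};

End of aside.]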
# @@ -342,15 +428,19 @@ $code.=<<___; .type __poly1305_block,\@abi-omnipotent .align 32 __poly1305_block: + push $ctx ___ &poly1305_iteration(); $code.=<<___; + pop $ctx ret .size __poly1305_block,.-__poly1305_block .type __poly1305_init_avx,\@abi-omnipotent .align 32 __poly1305_init_avx: + push %rbp + mov %rsp,%rbp mov $r0,$h0 mov $r1,$h1 xor $h2,$h2 @@ -507,12 +597,13 @@ __poly1305_init_avx: mov $d1#d,`16*8+8-64`($ctx) lea -48-64($ctx),$ctx # size [de-]optimization + pop %rbp ret .size __poly1305_init_avx,.-__poly1305_init_avx +___ -.type poly1305_blocks_avx,\@function,4 -.align 32 -poly1305_blocks_avx: +&declare_function("poly1305_blocks_avx", 32, 4); +$code.=<<___; .cfi_startproc mov 20($ctx),%r8d # is_base2_26 cmp \$128,$len @@ -532,10 +623,11 @@ poly1305_blocks_avx: test \$31,$len jz .Leven_avx - push %rbx -.cfi_push %rbx push %rbp .cfi_push %rbp + mov %rsp,%rbp + push %rbx +.cfi_push %rbx push %r12 .cfi_push %r12 push %r13 @@ -645,20 +737,18 @@ poly1305_blocks_avx: mov $h2#d,16($ctx) .align 16 .Ldone_avx: - mov 0(%rsp),%r15 + pop %r15 .cfi_restore %r15 - mov 8(%rsp),%r14 + pop %r14 .cfi_restore %r14 - mov 16(%rsp),%r13 + pop %r13 .cfi_restore %r13 - mov 24(%rsp),%r12 + pop %r12 .cfi_restore %r12 - mov 32(%rsp),%rbp -.cfi_restore %rbp - mov 40(%rsp),%rbx + pop %rbx .cfi_restore %rbx - lea 48(%rsp),%rsp -.cfi_adjust_cfa_offset -48 + pop %rbp +.cfi_restore %rbp .Lno_data_avx: .Lblocks_avx_epilogue: ret @@ -667,10 +757,11 @@ poly1305_blocks_avx: .align 32 .Lbase2_64_avx: .cfi_startproc - push %rbx -.cfi_push %rbx push %rbp .cfi_push %rbp + mov %rsp,%rbp + push %rbx +.cfi_push %rbx push %r12 .cfi_push %r12 push %r13 @@ -736,22 +827,18 @@ poly1305_blocks_avx: .Lproceed_avx: mov %r15,$len - - mov 0(%rsp),%r15 + pop %r15 .cfi_restore %r15 - mov 8(%rsp),%r14 + pop %r14 .cfi_restore %r14 - mov 16(%rsp),%r13 + pop %r13 .cfi_restore %r13 - mov 24(%rsp),%r12 + pop %r12 .cfi_restore %r12 - mov 32(%rsp),%rbp -.cfi_restore %rbp - mov 40(%rsp),%rbx + pop %rbx .cfi_restore %rbx - lea 48(%rsp),%rax - lea 48(%rsp),%rsp -.cfi_adjust_cfa_offset -48 + pop %rbp +.cfi_restore %rbp .Lbase2_64_avx_epilogue: jmp .Ldo_avx .cfi_endproc @@ -768,8 +855,11 @@ poly1305_blocks_avx: .Ldo_avx: ___ $code.=<<___ if (!$win64); + lea 8(%rsp),%r10 +.cfi_def_cfa_register %r10 + and \$-32,%rsp + sub \$-8,%rsp lea -0x58(%rsp),%r11 -.cfi_def_cfa %r11,0x60 sub \$0x178,%rsp ___ $code.=<<___ if ($win64); @@ -1361,18 +1451,18 @@ $code.=<<___ if ($win64); .Ldo_avx_epilogue: ___ $code.=<<___ if (!$win64); - lea 0x58(%r11),%rsp -.cfi_def_cfa %rsp,8 + lea -8(%r10),%rsp +.cfi_def_cfa_register %rsp ___ $code.=<<___; vzeroupper ret .cfi_endproc -.size poly1305_blocks_avx,.-poly1305_blocks_avx +___ +&end_function("poly1305_blocks_avx"); -.type poly1305_emit_avx,\@function,3 -.align 32 -poly1305_emit_avx: +&declare_function("poly1305_emit_avx", 32, 3); +$code.=<<___; cmpl \$0,20($ctx) # is_base2_26? je .Lemit @@ -1423,41 +1513,51 @@ poly1305_emit_avx: mov %rcx,8($mac) ret -.size poly1305_emit_avx,.-poly1305_emit_avx ___ +&end_function("poly1305_emit_avx"); + +if ($kernel) { + $code .= "#endif\n"; +} if ($avx>1) { + +if ($kernel) { + $code .= "#ifdef CONFIG_AS_AVX2\n"; +} + my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) = map("%ymm$_",(0..15)); my $S4=$MASK; +sub poly1305_blocks_avxN { + my ($avx512) = @_; + my $suffix = $avx512 ? 
"_avx512" : ""; $code.=<<___; -.type poly1305_blocks_avx2,\@function,4 -.align 32 -poly1305_blocks_avx2: .cfi_startproc mov 20($ctx),%r8d # is_base2_26 cmp \$128,$len - jae .Lblocks_avx2 + jae .Lblocks_avx2$suffix test %r8d,%r8d jz .Lblocks -.Lblocks_avx2: +.Lblocks_avx2$suffix: and \$-16,$len - jz .Lno_data_avx2 + jz .Lno_data_avx2$suffix vzeroupper test %r8d,%r8d - jz .Lbase2_64_avx2 + jz .Lbase2_64_avx2$suffix test \$63,$len - jz .Leven_avx2 + jz .Leven_avx2$suffix - push %rbx -.cfi_push %rbx push %rbp .cfi_push %rbp + mov %rsp,%rbp + push %rbx +.cfi_push %rbx push %r12 .cfi_push %r12 push %r13 @@ -1466,7 +1566,7 @@ poly1305_blocks_avx2: .cfi_push %r14 push %r15 .cfi_push %r15 -.Lblocks_avx2_body: +.Lblocks_avx2_body$suffix: mov $len,%r15 # reassign $len @@ -1513,7 +1613,7 @@ poly1305_blocks_avx2: shr \$2,$s1 add $r1,$s1 # s1 = r1 + (r1 >> 2) -.Lbase2_26_pre_avx2: +.Lbase2_26_pre_avx2$suffix: add 0($inp),$h0 # accumulate input adc 8($inp),$h1 lea 16($inp),$inp @@ -1524,10 +1624,10 @@ poly1305_blocks_avx2: mov $r1,%rax test \$63,%r15 - jnz .Lbase2_26_pre_avx2 + jnz .Lbase2_26_pre_avx2$suffix test $padbit,$padbit # if $padbit is zero, - jz .Lstore_base2_64_avx2 # store hash in base 2^64 format + jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format ################################# base 2^64 -> base 2^26 mov $h0,%rax @@ -1548,57 +1648,56 @@ poly1305_blocks_avx2: or $r1,$h2 # h[4] test %r15,%r15 - jz .Lstore_base2_26_avx2 + jz .Lstore_base2_26_avx2$suffix vmovd %rax#d,%x#$H0 vmovd %rdx#d,%x#$H1 vmovd $h0#d,%x#$H2 vmovd $h1#d,%x#$H3 vmovd $h2#d,%x#$H4 - jmp .Lproceed_avx2 + jmp .Lproceed_avx2$suffix .align 32 -.Lstore_base2_64_avx2: +.Lstore_base2_64_avx2$suffix: mov $h0,0($ctx) mov $h1,8($ctx) mov $h2,16($ctx) # note that is_base2_26 is zeroed - jmp .Ldone_avx2 + jmp .Ldone_avx2$suffix .align 16 -.Lstore_base2_26_avx2: +.Lstore_base2_26_avx2$suffix: mov %rax#d,0($ctx) # store hash value base 2^26 mov %rdx#d,4($ctx) mov $h0#d,8($ctx) mov $h1#d,12($ctx) mov $h2#d,16($ctx) .align 16 -.Ldone_avx2: - mov 0(%rsp),%r15 +.Ldone_avx2$suffix: + pop %r15 .cfi_restore %r15 - mov 8(%rsp),%r14 + pop %r14 .cfi_restore %r14 - mov 16(%rsp),%r13 + pop %r13 .cfi_restore %r13 - mov 24(%rsp),%r12 + pop %r12 .cfi_restore %r12 - mov 32(%rsp),%rbp -.cfi_restore %rbp - mov 40(%rsp),%rbx + pop %rbx .cfi_restore %rbx - lea 48(%rsp),%rsp -.cfi_adjust_cfa_offset -48 -.Lno_data_avx2: -.Lblocks_avx2_epilogue: + pop %rbp +.cfi_restore %rbp +.Lno_data_avx2$suffix: +.Lblocks_avx2_epilogue$suffix: ret .cfi_endproc .align 32 -.Lbase2_64_avx2: +.Lbase2_64_avx2$suffix: .cfi_startproc - push %rbx -.cfi_push %rbx push %rbp .cfi_push %rbp + mov %rsp,%rbp + push %rbx +.cfi_push %rbx push %r12 .cfi_push %r12 push %r13 @@ -1607,7 +1706,7 @@ poly1305_blocks_avx2: .cfi_push %r14 push %r15 .cfi_push %r15 -.Lbase2_64_avx2_body: +.Lbase2_64_avx2_body$suffix: mov $len,%r15 # reassign $len @@ -1624,9 +1723,9 @@ poly1305_blocks_avx2: add $r1,$s1 # s1 = r1 + (r1 >> 2) test \$63,$len - jz .Linit_avx2 + jz .Linit_avx2$suffix -.Lbase2_64_pre_avx2: +.Lbase2_64_pre_avx2$suffix: add 0($inp),$h0 # accumulate input adc 8($inp),$h1 lea 16($inp),$inp @@ -1637,9 +1736,9 @@ poly1305_blocks_avx2: mov $r1,%rax test \$63,%r15 - jnz .Lbase2_64_pre_avx2 + jnz .Lbase2_64_pre_avx2$suffix -.Linit_avx2: +.Linit_avx2$suffix: ################################# base 2^64 -> base 2^26 mov $h0,%rax mov $h0,%rdx @@ -1667,69 +1766,77 @@ poly1305_blocks_avx2: call __poly1305_init_avx -.Lproceed_avx2: +.Lproceed_avx2$suffix: mov %r15,$len # restore $len - mov 
OPENSSL_ia32cap_P+8(%rip),%r10d +___ +$code.=<<___ if (!$kernel); + mov OPENSSL_ia32cap_P+8(%rip),%r9d mov \$`(1<<31|1<<30|1<<16)`,%r11d - - mov 0(%rsp),%r15 +___ +$code.=<<___; + pop %r15 .cfi_restore %r15 - mov 8(%rsp),%r14 + pop %r14 .cfi_restore %r14 - mov 16(%rsp),%r13 + pop %r13 .cfi_restore %r13 - mov 24(%rsp),%r12 + pop %r12 .cfi_restore %r12 - mov 32(%rsp),%rbp -.cfi_restore %rbp - mov 40(%rsp),%rbx + pop %rbx .cfi_restore %rbx - lea 48(%rsp),%rax - lea 48(%rsp),%rsp -.cfi_adjust_cfa_offset -48 -.Lbase2_64_avx2_epilogue: - jmp .Ldo_avx2 + pop %rbp +.cfi_restore %rbp +.Lbase2_64_avx2_epilogue$suffix: + jmp .Ldo_avx2$suffix .cfi_endproc .align 32 -.Leven_avx2: +.Leven_avx2$suffix: .cfi_startproc - mov OPENSSL_ia32cap_P+8(%rip),%r10d +___ +$code.=<<___ if (!$kernel); + mov OPENSSL_ia32cap_P+8(%rip),%r9d +___ +$code.=<<___; vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26 vmovd 4*1($ctx),%x#$H1 vmovd 4*2($ctx),%x#$H2 vmovd 4*3($ctx),%x#$H3 vmovd 4*4($ctx),%x#$H4 -.Ldo_avx2: +.Ldo_avx2$suffix: ___ -$code.=<<___ if ($avx>2); +$code.=<<___ if (!$kernel && $avx>2); cmp \$512,$len jb .Lskip_avx512 - and %r11d,%r10d - test \$`1<<16`,%r10d # check for AVX512F + and %r11d,%r9d + test \$`1<<16`,%r9d # check for AVX512F jnz .Lblocks_avx512 -.Lskip_avx512: +.Lskip_avx512$suffix: +___ +$code.=<<___ if ($avx > 2 && $avx512 && $kernel); + cmp \$512,$len + jae .Lblocks_avx512 ___ $code.=<<___ if (!$win64); - lea -8(%rsp),%r11 -.cfi_def_cfa %r11,16 + lea 8(%rsp),%r10 +.cfi_def_cfa_register %r10 sub \$0x128,%rsp ___ $code.=<<___ if ($win64); - lea -0xf8(%rsp),%r11 + lea 8(%rsp),%r10 sub \$0x1c8,%rsp - vmovdqa %xmm6,0x50(%r11) - vmovdqa %xmm7,0x60(%r11) - vmovdqa %xmm8,0x70(%r11) - vmovdqa %xmm9,0x80(%r11) - vmovdqa %xmm10,0x90(%r11) - vmovdqa %xmm11,0xa0(%r11) - vmovdqa %xmm12,0xb0(%r11) - vmovdqa %xmm13,0xc0(%r11) - vmovdqa %xmm14,0xd0(%r11) - vmovdqa %xmm15,0xe0(%r11) -.Ldo_avx2_body: + vmovdqa %xmm6,-0xb0(%r10) + vmovdqa %xmm7,-0xa0(%r10) + vmovdqa %xmm8,-0x90(%r10) + vmovdqa %xmm9,-0x80(%r10) + vmovdqa %xmm10,-0x70(%r10) + vmovdqa %xmm11,-0x60(%r10) + vmovdqa %xmm12,-0x50(%r10) + vmovdqa %xmm13,-0x40(%r10) + vmovdqa %xmm14,-0x30(%r10) + vmovdqa %xmm15,-0x20(%r10) +.Ldo_avx2_body$suffix: ___ $code.=<<___; lea .Lconst(%rip),%rcx @@ -1794,11 +1901,11 @@ $code.=<<___; vpaddq $H2,$T2,$H2 # accumulate input sub \$64,$len - jz .Ltail_avx2 - jmp .Loop_avx2 + jz .Ltail_avx2$suffix + jmp .Loop_avx2$suffix .align 32 -.Loop_avx2: +.Loop_avx2$suffix: ################################################################ # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4 # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3 @@ -1946,10 +2053,10 @@ $code.=<<___; vpor 32(%rcx),$T4,$T4 # padbit, yes, always sub \$64,$len - jnz .Loop_avx2 + jnz .Loop_avx2$suffix .byte 0x66,0x90 -.Ltail_avx2: +.Ltail_avx2$suffix: ################################################################ # while above multiplications were by r^4 in all lanes, in last # iteration we multiply least significant lane by r^4 and most @@ -2087,37 +2194,29 @@ $code.=<<___; vmovd %x#$H4,`4*4-48-64`($ctx) ___ $code.=<<___ if ($win64); - vmovdqa 0x50(%r11),%xmm6 - vmovdqa 0x60(%r11),%xmm7 - vmovdqa 0x70(%r11),%xmm8 - vmovdqa 0x80(%r11),%xmm9 - vmovdqa 0x90(%r11),%xmm10 - vmovdqa 0xa0(%r11),%xmm11 - vmovdqa 0xb0(%r11),%xmm12 - vmovdqa 0xc0(%r11),%xmm13 - vmovdqa 0xd0(%r11),%xmm14 - vmovdqa 0xe0(%r11),%xmm15 - lea 0xf8(%r11),%rsp -.Ldo_avx2_epilogue: + vmovdqa -0xb0(%r10),%xmm6 + vmovdqa -0xa0(%r10),%xmm7 + vmovdqa -0x90(%r10),%xmm8 + vmovdqa -0x80(%r10),%xmm9 + vmovdqa 
-0x70(%r10),%xmm10 + vmovdqa -0x60(%r10),%xmm11 + vmovdqa -0x50(%r10),%xmm12 + vmovdqa -0x40(%r10),%xmm13 + vmovdqa -0x30(%r10),%xmm14 + vmovdqa -0x20(%r10),%xmm15 + lea -8(%r10),%rsp +.Ldo_avx2_epilogue$suffix: ___ $code.=<<___ if (!$win64); - lea 8(%r11),%rsp -.cfi_def_cfa %rsp,8 + lea -8(%r10),%rsp +.cfi_def_cfa_register %rsp ___ $code.=<<___; vzeroupper ret .cfi_endproc -.size poly1305_blocks_avx2,.-poly1305_blocks_avx2 ___ -####################################################################### -if ($avx>2) { -# On entry we have input length divisible by 64. But since inner loop -# processes 128 bytes per iteration, cases when length is not divisible -# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this -# reason stack layout is kept identical to poly1305_blocks_avx2. If not -# for this tail, we wouldn't have to even allocate stack frame... - +if($avx > 2 && $avx512) { my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24)); my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29)); my $PADBIT="%zmm30"; @@ -2128,32 +2227,29 @@ map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4)); map(s/%y/%z/,($MASK)); $code.=<<___; -.type poly1305_blocks_avx512,\@function,4 -.align 32 -poly1305_blocks_avx512: .cfi_startproc .Lblocks_avx512: mov \$15,%eax kmovw %eax,%k2 ___ $code.=<<___ if (!$win64); - lea -8(%rsp),%r11 -.cfi_def_cfa %r11,16 + lea 8(%rsp),%r10 +.cfi_def_cfa_register %r10 sub \$0x128,%rsp ___ $code.=<<___ if ($win64); - lea -0xf8(%rsp),%r11 + lea 8(%rsp),%r10 sub \$0x1c8,%rsp - vmovdqa %xmm6,0x50(%r11) - vmovdqa %xmm7,0x60(%r11) - vmovdqa %xmm8,0x70(%r11) - vmovdqa %xmm9,0x80(%r11) - vmovdqa %xmm10,0x90(%r11) - vmovdqa %xmm11,0xa0(%r11) - vmovdqa %xmm12,0xb0(%r11) - vmovdqa %xmm13,0xc0(%r11) - vmovdqa %xmm14,0xd0(%r11) - vmovdqa %xmm15,0xe0(%r11) + vmovdqa %xmm6,-0xb0(%r10) + vmovdqa %xmm7,-0xa0(%r10) + vmovdqa %xmm8,-0x90(%r10) + vmovdqa %xmm9,-0x80(%r10) + vmovdqa %xmm10,-0x70(%r10) + vmovdqa %xmm11,-0x60(%r10) + vmovdqa %xmm12,-0x50(%r10) + vmovdqa %xmm13,-0x40(%r10) + vmovdqa %xmm14,-0x30(%r10) + vmovdqa %xmm15,-0x20(%r10) .Ldo_avx512_body: ___ $code.=<<___; @@ -2679,7 +2775,7 @@ $code.=<<___; lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2 add \$64,$len - jnz .Ltail_avx2 + jnz .Ltail_avx2$suffix vpsubq $T2,$H2,$H2 # undo input accumulation vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced @@ -2690,29 +2786,61 @@ $code.=<<___; vzeroall ___ $code.=<<___ if ($win64); - movdqa 0x50(%r11),%xmm6 - movdqa 0x60(%r11),%xmm7 - movdqa 0x70(%r11),%xmm8 - movdqa 0x80(%r11),%xmm9 - movdqa 0x90(%r11),%xmm10 - movdqa 0xa0(%r11),%xmm11 - movdqa 0xb0(%r11),%xmm12 - movdqa 0xc0(%r11),%xmm13 - movdqa 0xd0(%r11),%xmm14 - movdqa 0xe0(%r11),%xmm15 - lea 0xf8(%r11),%rsp + movdqa -0xb0(%r10),%xmm6 + movdqa -0xa0(%r10),%xmm7 + movdqa -0x90(%r10),%xmm8 + movdqa -0x80(%r10),%xmm9 + movdqa -0x70(%r10),%xmm10 + movdqa -0x60(%r10),%xmm11 + movdqa -0x50(%r10),%xmm12 + movdqa -0x40(%r10),%xmm13 + movdqa -0x30(%r10),%xmm14 + movdqa -0x20(%r10),%xmm15 + lea -8(%r10),%rsp .Ldo_avx512_epilogue: ___ $code.=<<___ if (!$win64); - lea 8(%r11),%rsp -.cfi_def_cfa %rsp,8 + lea -8(%r10),%rsp +.cfi_def_cfa_register %rsp ___ $code.=<<___; ret .cfi_endproc -.size poly1305_blocks_avx512,.-poly1305_blocks_avx512 ___ -if ($avx>3) { + +} + +} + +&declare_function("poly1305_blocks_avx2", 32, 4); +poly1305_blocks_avxN(0); +&end_function("poly1305_blocks_avx2"); + +if($kernel) { + $code .= "#endif\n"; +} + +####################################################################### +if ($avx>2) { +# On entry we have input 
length divisible by 64. But since inner loop +# processes 128 bytes per iteration, cases when length is not divisible +# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this +# reason stack layout is kept identical to poly1305_blocks_avx2. If not +# for this tail, we wouldn't have to even allocate stack frame... + +if($kernel) { + $code .= "#ifdef CONFIG_AS_AVX512\n"; +} + +&declare_function("poly1305_blocks_avx512", 32, 4); +poly1305_blocks_avxN(1); +&end_function("poly1305_blocks_avx512"); + +if ($kernel) { + $code .= "#endif\n"; +} + +if (!$kernel && $avx>3) { ######################################################################## # VPMADD52 version using 2^44 radix. # @@ -3753,45 +3881,9 @@ poly1305_emit_base2_44: .size poly1305_emit_base2_44,.-poly1305_emit_base2_44 ___ } } } -$code.=<<___; -.align 64 -.Lconst: -.Lmask24: -.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 -.L129: -.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0 -.Lmask26: -.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 -.Lpermd_avx2: -.long 2,2,2,3,2,0,2,1 -.Lpermd_avx512: -.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 - -.L2_44_inp_permd: -.long 0,1,1,2,2,3,7,7 -.L2_44_inp_shift: -.quad 0,12,24,64 -.L2_44_mask: -.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff -.L2_44_shift_rgt: -.quad 44,44,42,64 -.L2_44_shift_lft: -.quad 8,8,10,64 - -.align 64 -.Lx_mask44: -.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff -.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff -.Lx_mask42: -.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff -.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff -___ } -$code.=<<___; -.asciz "Poly1305 for x86_64, CRYPTOGAMS by " -.align 16 -___ +if (!$kernel) { # chacha20-poly1305 helpers my ($out,$inp,$otp,$len)=$win64 ? 
("%rcx","%rdx","%r8", "%r9") : # Win64 order ("%rdi","%rsi","%rdx","%rcx"); # Unix order @@ -4038,17 +4130,17 @@ avx_handler: .section .pdata .align 4 - .rva .LSEH_begin_poly1305_init - .rva .LSEH_end_poly1305_init - .rva .LSEH_info_poly1305_init + .rva .LSEH_begin_poly1305_init_x86_64 + .rva .LSEH_end_poly1305_init_x86_64 + .rva .LSEH_info_poly1305_init_x86_64 - .rva .LSEH_begin_poly1305_blocks - .rva .LSEH_end_poly1305_blocks - .rva .LSEH_info_poly1305_blocks + .rva .LSEH_begin_poly1305_blocks_x86_64 + .rva .LSEH_end_poly1305_blocks_x86_64 + .rva .LSEH_info_poly1305_blocks_x86_64 - .rva .LSEH_begin_poly1305_emit - .rva .LSEH_end_poly1305_emit - .rva .LSEH_info_poly1305_emit + .rva .LSEH_begin_poly1305_emit_x86_64 + .rva .LSEH_end_poly1305_emit_x86_64 + .rva .LSEH_info_poly1305_emit_x86_64 ___ $code.=<<___ if ($avx); .rva .LSEH_begin_poly1305_blocks_avx @@ -4088,20 +4180,20 @@ ___ $code.=<<___; .section .xdata .align 8 -.LSEH_info_poly1305_init: +.LSEH_info_poly1305_init_x86_64: .byte 9,0,0,0 .rva se_handler - .rva .LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init + .rva .LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64 -.LSEH_info_poly1305_blocks: +.LSEH_info_poly1305_blocks_x86_64: .byte 9,0,0,0 .rva se_handler .rva .Lblocks_body,.Lblocks_epilogue -.LSEH_info_poly1305_emit: +.LSEH_info_poly1305_emit_x86_64: .byte 9,0,0,0 .rva se_handler - .rva .LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit + .rva .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64 ___ $code.=<<___ if ($avx); .LSEH_info_poly1305_blocks_avx_1: @@ -4148,12 +4240,26 @@ $code.=<<___ if ($avx>2); ___ } +open SELF,$0; +while() { + next if (/^#!/); + last if (!s/^#/\/\// and !/^$/); + print; +} +close SELF; + foreach (split('\n',$code)) { s/\`([^\`]*)\`/eval($1)/ge; s/%r([a-z]+)#d/%e$1/g; s/%r([0-9]+)#d/%r$1d/g; s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g; + if ($kernel) { + s/(^\.type.*),[0-9]+$/\1/; + s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/; + next if /^\.cfi.*/; + } + print $_,"\n"; } close STDOUT; diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c index edb7113e36f3..657363588e0c 100644 --- a/arch/x86/crypto/poly1305_glue.c +++ b/arch/x86/crypto/poly1305_glue.c @@ -1,8 +1,6 @@ -// SPDX-License-Identifier: GPL-2.0-or-later +// SPDX-License-Identifier: GPL-2.0 OR MIT /* - * Poly1305 authenticator algorithm, RFC7539, SIMD glue code - * - * Copyright (C) 2015 Martin Willi + * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. 
*/ #include @@ -13,279 +11,170 @@ #include #include #include +#include #include -asmlinkage void poly1305_block_sse2(u32 *h, const u8 *src, - const u32 *r, unsigned int blocks); -asmlinkage void poly1305_2block_sse2(u32 *h, const u8 *src, const u32 *r, - unsigned int blocks, const u32 *u); -asmlinkage void poly1305_4block_avx2(u32 *h, const u8 *src, const u32 *r, - unsigned int blocks, const u32 *u); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_simd); +asmlinkage void poly1305_init_x86_64(void *ctx, + const u8 key[POLY1305_KEY_SIZE]); +asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp, + const size_t len, const u32 padbit); +asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); +asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); +asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len, + const u32 padbit); +asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len, + const u32 padbit); +asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp, + const size_t len, const u32 padbit); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx); static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512); + +struct poly1305_arch_internal { + union { + struct { + u32 h[5]; + u32 is_base2_26; + }; + u64 hs[3]; + }; + u64 r[2]; + u64 pad; + struct { u32 r2, r1, r4, r3; } rn[9]; +}; -static inline u64 mlt(u64 a, u64 b) +/* The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit + * the unfortunate situation of using AVX and then having to go back to scalar + * -- because the user is silly and has called the update function from two + * separate contexts -- then we need to convert back to the original base before + * proceeding. It is possible to reason that the initial reduction below is + * sufficient given the implementation invariants. However, for an avoidance of + * doubt and because this is not performance critical, we do the full reduction + * anyway. 
Z3 proof of below function: https://xn--4db.cc/ltPtHCKN/py + */ +static void convert_to_base2_64(void *ctx) { - return a * b; -} + struct poly1305_arch_internal *state = ctx; + u32 cy; -static inline u32 sr(u64 v, u_char n) -{ - return v >> n; -} + if (!state->is_base2_26) + return; -static inline u32 and(u32 v, u32 mask) -{ - return v & mask; + cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy; + cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy; + cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy; + cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy; + state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0]; + state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12); + state->hs[2] = state->h[4] >> 24; +#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1)) + cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL); + state->hs[2] &= 3; + state->hs[0] += cy; + state->hs[1] += (cy = ULT(state->hs[0], cy)); + state->hs[2] += ULT(state->hs[1], cy); +#undef ULT + state->is_base2_26 = 0; } -static void poly1305_simd_mult(u32 *a, const u32 *b) +static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_KEY_SIZE]) { - u8 m[POLY1305_BLOCK_SIZE]; - - memset(m, 0, sizeof(m)); - /* The poly1305 block function adds a hi-bit to the accumulator which - * we don't need for key multiplication; compensate for it. */ - a[4] -= 1 << 24; - poly1305_block_sse2(a, m, b, 1); + poly1305_init_x86_64(ctx, key); } -static void poly1305_integer_setkey(struct poly1305_key *key, const u8 *raw_key) +static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len, + const u32 padbit) { - /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ - key->r[0] = (get_unaligned_le32(raw_key + 0) >> 0) & 0x3ffffff; - key->r[1] = (get_unaligned_le32(raw_key + 3) >> 2) & 0x3ffff03; - key->r[2] = (get_unaligned_le32(raw_key + 6) >> 4) & 0x3ffc0ff; - key->r[3] = (get_unaligned_le32(raw_key + 9) >> 6) & 0x3f03fff; - key->r[4] = (get_unaligned_le32(raw_key + 12) >> 8) & 0x00fffff; -} + struct poly1305_arch_internal *state = ctx; -static void poly1305_integer_blocks(struct poly1305_state *state, - const struct poly1305_key *key, - const void *src, - unsigned int nblocks, u32 hibit) -{ - u32 r0, r1, r2, r3, r4; - u32 s1, s2, s3, s4; - u32 h0, h1, h2, h3, h4; - u64 d0, d1, d2, d3, d4; + /* SIMD disables preemption, so relax after processing each page. 
*/ + BUILD_BUG_ON(PAGE_SIZE < POLY1305_BLOCK_SIZE || + PAGE_SIZE % POLY1305_BLOCK_SIZE); - if (!nblocks) + if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) || + (len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) || + !crypto_simd_usable()) { + convert_to_base2_64(ctx); + poly1305_blocks_x86_64(ctx, inp, len, padbit); return; + } - r0 = key->r[0]; - r1 = key->r[1]; - r2 = key->r[2]; - r3 = key->r[3]; - r4 = key->r[4]; - - s1 = r1 * 5; - s2 = r2 * 5; - s3 = r3 * 5; - s4 = r4 * 5; - - h0 = state->h[0]; - h1 = state->h[1]; - h2 = state->h[2]; - h3 = state->h[3]; - h4 = state->h[4]; - - do { - /* h += m[i] */ - h0 += (get_unaligned_le32(src + 0) >> 0) & 0x3ffffff; - h1 += (get_unaligned_le32(src + 3) >> 2) & 0x3ffffff; - h2 += (get_unaligned_le32(src + 6) >> 4) & 0x3ffffff; - h3 += (get_unaligned_le32(src + 9) >> 6) & 0x3ffffff; - h4 += (get_unaligned_le32(src + 12) >> 8) | (hibit << 24); - - /* h *= r */ - d0 = mlt(h0, r0) + mlt(h1, s4) + mlt(h2, s3) + - mlt(h3, s2) + mlt(h4, s1); - d1 = mlt(h0, r1) + mlt(h1, r0) + mlt(h2, s4) + - mlt(h3, s3) + mlt(h4, s2); - d2 = mlt(h0, r2) + mlt(h1, r1) + mlt(h2, r0) + - mlt(h3, s4) + mlt(h4, s3); - d3 = mlt(h0, r3) + mlt(h1, r2) + mlt(h2, r1) + - mlt(h3, r0) + mlt(h4, s4); - d4 = mlt(h0, r4) + mlt(h1, r3) + mlt(h2, r2) + - mlt(h3, r1) + mlt(h4, r0); - - /* (partial) h %= p */ - d1 += sr(d0, 26); h0 = and(d0, 0x3ffffff); - d2 += sr(d1, 26); h1 = and(d1, 0x3ffffff); - d3 += sr(d2, 26); h2 = and(d2, 0x3ffffff); - d4 += sr(d3, 26); h3 = and(d3, 0x3ffffff); - h0 += sr(d4, 26) * 5; h4 = and(d4, 0x3ffffff); - h1 += h0 >> 26; h0 = h0 & 0x3ffffff; - - src += POLY1305_BLOCK_SIZE; - } while (--nblocks); - - state->h[0] = h0; - state->h[1] = h1; - state->h[2] = h2; - state->h[3] = h3; - state->h[4] = h4; + for (;;) { + const size_t bytes = min_t(size_t, len, PAGE_SIZE); + + kernel_fpu_begin(); + if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512)) + poly1305_blocks_avx512(ctx, inp, bytes, padbit); + else if (IS_ENABLED(CONFIG_AS_AVX2) && static_branch_likely(&poly1305_use_avx2)) + poly1305_blocks_avx2(ctx, inp, bytes, padbit); + else + poly1305_blocks_avx(ctx, inp, bytes, padbit); + kernel_fpu_end(); + len -= bytes; + if (!len) + break; + inp += bytes; + } } -static void poly1305_integer_emit(const struct poly1305_state *state, void *dst) +static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], + const u32 nonce[4]) { - u32 h0, h1, h2, h3, h4; - u32 g0, g1, g2, g3, g4; - u32 mask; - - /* fully carry h */ - h0 = state->h[0]; - h1 = state->h[1]; - h2 = state->h[2]; - h3 = state->h[3]; - h4 = state->h[4]; - - h2 += (h1 >> 26); h1 = h1 & 0x3ffffff; - h3 += (h2 >> 26); h2 = h2 & 0x3ffffff; - h4 += (h3 >> 26); h3 = h3 & 0x3ffffff; - h0 += (h4 >> 26) * 5; h4 = h4 & 0x3ffffff; - h1 += (h0 >> 26); h0 = h0 & 0x3ffffff; - - /* compute h + -p */ - g0 = h0 + 5; - g1 = h1 + (g0 >> 26); g0 &= 0x3ffffff; - g2 = h2 + (g1 >> 26); g1 &= 0x3ffffff; - g3 = h3 + (g2 >> 26); g2 &= 0x3ffffff; - g4 = h4 + (g3 >> 26) - (1 << 26); g3 &= 0x3ffffff; - - /* select h if h < p, or h + -p if h >= p */ - mask = (g4 >> ((sizeof(u32) * 8) - 1)) - 1; - g0 &= mask; - g1 &= mask; - g2 &= mask; - g3 &= mask; - g4 &= mask; - mask = ~mask; - h0 = (h0 & mask) | g0; - h1 = (h1 & mask) | g1; - h2 = (h2 & mask) | g2; - h3 = (h3 & mask) | g3; - h4 = (h4 & mask) | g4; - - /* h = h % (2^128) */ - put_unaligned_le32((h0 >> 0) | (h1 << 26), dst + 0); - put_unaligned_le32((h1 >> 6) | (h2 << 20), dst + 4); - put_unaligned_le32((h2 >> 12) | (h3 
<< 14), dst + 8); - put_unaligned_le32((h3 >> 18) | (h4 << 8), dst + 12); + struct poly1305_arch_internal *state = ctx; + + if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) || + !state->is_base2_26 || !crypto_simd_usable()) { + convert_to_base2_64(ctx); + poly1305_emit_x86_64(ctx, mac, nonce); + } else + poly1305_emit_avx(ctx, mac, nonce); } -void poly1305_init_arch(struct poly1305_desc_ctx *desc, const u8 *key) +void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key) { - poly1305_integer_setkey(desc->opaque_r, key); - desc->s[0] = get_unaligned_le32(key + 16); - desc->s[1] = get_unaligned_le32(key + 20); - desc->s[2] = get_unaligned_le32(key + 24); - desc->s[3] = get_unaligned_le32(key + 28); - poly1305_core_init(&desc->h); - desc->buflen = 0; - desc->sset = true; - desc->rset = 1; + poly1305_simd_init(&dctx->h, key); + dctx->s[0] = get_unaligned_le32(&key[16]); + dctx->s[1] = get_unaligned_le32(&key[20]); + dctx->s[2] = get_unaligned_le32(&key[24]); + dctx->s[3] = get_unaligned_le32(&key[28]); + dctx->buflen = 0; + dctx->sset = true; } -EXPORT_SYMBOL_GPL(poly1305_init_arch); +EXPORT_SYMBOL(poly1305_init_arch); -static unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx, - const u8 *src, unsigned int srclen) +static unsigned int crypto_poly1305_setdctxkey(struct poly1305_desc_ctx *dctx, + const u8 *inp, unsigned int len) { - if (!dctx->sset) { - if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) { - poly1305_integer_setkey(dctx->r, src); - src += POLY1305_BLOCK_SIZE; - srclen -= POLY1305_BLOCK_SIZE; + unsigned int acc = 0; + if (unlikely(!dctx->sset)) { + if (!dctx->rset && len >= POLY1305_BLOCK_SIZE) { + poly1305_simd_init(&dctx->h, inp); + inp += POLY1305_BLOCK_SIZE; + len -= POLY1305_BLOCK_SIZE; + acc += POLY1305_BLOCK_SIZE; dctx->rset = 1; } - if (srclen >= POLY1305_BLOCK_SIZE) { - dctx->s[0] = get_unaligned_le32(src + 0); - dctx->s[1] = get_unaligned_le32(src + 4); - dctx->s[2] = get_unaligned_le32(src + 8); - dctx->s[3] = get_unaligned_le32(src + 12); - src += POLY1305_BLOCK_SIZE; - srclen -= POLY1305_BLOCK_SIZE; + if (len >= POLY1305_BLOCK_SIZE) { + dctx->s[0] = get_unaligned_le32(&inp[0]); + dctx->s[1] = get_unaligned_le32(&inp[4]); + dctx->s[2] = get_unaligned_le32(&inp[8]); + dctx->s[3] = get_unaligned_le32(&inp[12]); + inp += POLY1305_BLOCK_SIZE; + len -= POLY1305_BLOCK_SIZE; + acc += POLY1305_BLOCK_SIZE; dctx->sset = true; } } - return srclen; -} - -static unsigned int poly1305_scalar_blocks(struct poly1305_desc_ctx *dctx, - const u8 *src, unsigned int srclen) -{ - unsigned int datalen; - - if (unlikely(!dctx->sset)) { - datalen = crypto_poly1305_setdesckey(dctx, src, srclen); - src += srclen - datalen; - srclen = datalen; - } - if (srclen >= POLY1305_BLOCK_SIZE) { - poly1305_integer_blocks(&dctx->h, dctx->opaque_r, src, - srclen / POLY1305_BLOCK_SIZE, 1); - srclen %= POLY1305_BLOCK_SIZE; - } - return srclen; -} - -static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx, - const u8 *src, unsigned int srclen) -{ - unsigned int blocks, datalen; - - if (unlikely(!dctx->sset)) { - datalen = crypto_poly1305_setdesckey(dctx, src, srclen); - src += srclen - datalen; - srclen = datalen; - } - - if (IS_ENABLED(CONFIG_AS_AVX2) && - static_branch_likely(&poly1305_use_avx2) && - srclen >= POLY1305_BLOCK_SIZE * 4) { - if (unlikely(dctx->rset < 4)) { - if (dctx->rset < 2) { - dctx->r[1] = dctx->r[0]; - poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r); - } - dctx->r[2] = dctx->r[1]; - poly1305_simd_mult(dctx->r[2].r, 
dctx->r[0].r); - dctx->r[3] = dctx->r[2]; - poly1305_simd_mult(dctx->r[3].r, dctx->r[0].r); - dctx->rset = 4; - } - blocks = srclen / (POLY1305_BLOCK_SIZE * 4); - poly1305_4block_avx2(dctx->h.h, src, dctx->r[0].r, blocks, - dctx->r[1].r); - src += POLY1305_BLOCK_SIZE * 4 * blocks; - srclen -= POLY1305_BLOCK_SIZE * 4 * blocks; - } - - if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) { - if (unlikely(dctx->rset < 2)) { - dctx->r[1] = dctx->r[0]; - poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r); - dctx->rset = 2; - } - blocks = srclen / (POLY1305_BLOCK_SIZE * 2); - poly1305_2block_sse2(dctx->h.h, src, dctx->r[0].r, - blocks, dctx->r[1].r); - src += POLY1305_BLOCK_SIZE * 2 * blocks; - srclen -= POLY1305_BLOCK_SIZE * 2 * blocks; - } - if (srclen >= POLY1305_BLOCK_SIZE) { - poly1305_block_sse2(dctx->h.h, src, dctx->r[0].r, 1); - srclen -= POLY1305_BLOCK_SIZE; - } - return srclen; + return acc; } void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src, unsigned int srclen) { - unsigned int bytes; + unsigned int bytes, used; if (unlikely(dctx->buflen)) { bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen); @@ -295,31 +184,19 @@ void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src, dctx->buflen += bytes; if (dctx->buflen == POLY1305_BLOCK_SIZE) { - if (static_branch_likely(&poly1305_use_simd) && - likely(crypto_simd_usable())) { - kernel_fpu_begin(); - poly1305_simd_blocks(dctx, dctx->buf, - POLY1305_BLOCK_SIZE); - kernel_fpu_end(); - } else { - poly1305_scalar_blocks(dctx, dctx->buf, - POLY1305_BLOCK_SIZE); - } + if (likely(!crypto_poly1305_setdctxkey(dctx, dctx->buf, POLY1305_BLOCK_SIZE))) + poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1); dctx->buflen = 0; } } if (likely(srclen >= POLY1305_BLOCK_SIZE)) { - if (static_branch_likely(&poly1305_use_simd) && - likely(crypto_simd_usable())) { - kernel_fpu_begin(); - bytes = poly1305_simd_blocks(dctx, src, srclen); - kernel_fpu_end(); - } else { - bytes = poly1305_scalar_blocks(dctx, src, srclen); - } - src += srclen - bytes; - srclen = bytes; + bytes = round_down(srclen, POLY1305_BLOCK_SIZE); + srclen -= bytes; + used = crypto_poly1305_setdctxkey(dctx, src, bytes); + if (likely(bytes - used)) + poly1305_simd_blocks(&dctx->h, src + used, bytes - used, 1); + src += bytes; } if (unlikely(srclen)) { @@ -329,31 +206,17 @@ void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src, } EXPORT_SYMBOL(poly1305_update_arch); -void poly1305_final_arch(struct poly1305_desc_ctx *desc, u8 *dst) +void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst) { - __le32 digest[4]; - u64 f = 0; - - if (unlikely(desc->buflen)) { - desc->buf[desc->buflen++] = 1; - memset(desc->buf + desc->buflen, 0, - POLY1305_BLOCK_SIZE - desc->buflen); - poly1305_integer_blocks(&desc->h, desc->opaque_r, desc->buf, 1, 0); + if (unlikely(dctx->buflen)) { + dctx->buf[dctx->buflen++] = 1; + memset(dctx->buf + dctx->buflen, 0, + POLY1305_BLOCK_SIZE - dctx->buflen); + poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0); } - poly1305_integer_emit(&desc->h, digest); - - /* mac = (h + s) % (2^128) */ - f = (f >> 32) + le32_to_cpu(digest[0]) + desc->s[0]; - put_unaligned_le32(f, dst + 0); - f = (f >> 32) + le32_to_cpu(digest[1]) + desc->s[1]; - put_unaligned_le32(f, dst + 4); - f = (f >> 32) + le32_to_cpu(digest[2]) + desc->s[2]; - put_unaligned_le32(f, dst + 8); - f = (f >> 32) + le32_to_cpu(digest[3]) + desc->s[3]; - put_unaligned_le32(f, dst + 12); - - *desc = (struct poly1305_desc_ctx){}; + 
poly1305_simd_emit(&dctx->h, dst, dctx->s); + *dctx = (struct poly1305_desc_ctx){}; } EXPORT_SYMBOL(poly1305_final_arch); @@ -361,38 +224,34 @@ static int crypto_poly1305_init(struct shash_desc *desc) { struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); - poly1305_core_init(&dctx->h); - dctx->buflen = 0; - dctx->rset = 0; - dctx->sset = false; - + *dctx = (struct poly1305_desc_ctx){}; return 0; } -static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst) +static int crypto_poly1305_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) { struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); - if (unlikely(!dctx->sset)) - return -ENOKEY; - - poly1305_final_arch(dctx, dst); + poly1305_update_arch(dctx, src, srclen); return 0; } -static int poly1305_simd_update(struct shash_desc *desc, - const u8 *src, unsigned int srclen) +static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst) { struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); - poly1305_update_arch(dctx, src, srclen); + if (unlikely(!dctx->sset)) + return -ENOKEY; + + poly1305_final_arch(dctx, dst); return 0; } static struct shash_alg alg = { .digestsize = POLY1305_DIGEST_SIZE, .init = crypto_poly1305_init, - .update = poly1305_simd_update, + .update = crypto_poly1305_update, .final = crypto_poly1305_final, .descsize = sizeof(struct poly1305_desc_ctx), .base = { @@ -406,17 +265,19 @@ static struct shash_alg alg = { static int __init poly1305_simd_mod_init(void) { - if (!boot_cpu_has(X86_FEATURE_XMM2)) - return 0; - - static_branch_enable(&poly1305_use_simd); - - if (IS_ENABLED(CONFIG_AS_AVX2) && - boot_cpu_has(X86_FEATURE_AVX) && + if (IS_ENABLED(CONFIG_AS_AVX) && boot_cpu_has(X86_FEATURE_AVX) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) + static_branch_enable(&poly1305_use_avx); + if (IS_ENABLED(CONFIG_AS_AVX2) && boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) static_branch_enable(&poly1305_use_avx2); - + if (IS_ENABLED(CONFIG_AS_AVX512) && boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) && + /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */ + boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X) + static_branch_enable(&poly1305_use_avx512); return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? crypto_register_shash(&alg) : 0; } @@ -430,7 +291,7 @@ module_init(poly1305_simd_mod_init); module_exit(poly1305_simd_mod_exit); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Martin Willi "); +MODULE_AUTHOR("Jason A. Donenfeld "); MODULE_DESCRIPTION("Poly1305 authenticator"); MODULE_ALIAS_CRYPTO("poly1305"); MODULE_ALIAS_CRYPTO("poly1305-simd"); diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 0b2c4fce26d9..14c032de276e 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -90,7 +90,7 @@ config CRYPTO_LIB_DES config CRYPTO_LIB_POLY1305_RSIZE int default 2 if MIPS - default 4 if X86_64 + default 11 if X86_64 default 9 if ARM || ARM64 default 1 -- cgit v1.2.3 From 41419a289010836bd759bf7e254fe041a3dc52d2 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 14 Jan 2020 19:57:29 -0800 Subject: crypto: x86/sha - Eliminate casts on asm implementations In order to avoid CFI function prototype mismatches, this removes the casts on assembly implementations of sha1/256/512 accelerators. The safety checks from BUILD_BUG_ON() remain. 
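[Editor's aside: a self-contained C sketch, with hypothetical stand-in names, of the CFI failure mode the commit message refers to. Under clang's -fsanitize=cfi-icall, an indirect call is only permitted when the declared type of the target function matches the function-pointer type at the call site; calling through a pointer that was cast from a mismatched prototype aborts at runtime even if the argument registers happen to line up:

#include <stdint.h>

struct sha_state_demo { uint32_t state[5]; };

typedef void block_fn(struct sha_state_demo *st, const uint8_t *data,
		      int blocks);

/* Declared with an unrelated prototype, as the old glue code did for
 * the asm routines. */
static void transform_old(uint32_t *digest, const char *data,
			  unsigned int rounds)
{ (void)digest; (void)data; (void)rounds; }

/* Declared with the common prototype, as this commit does: no cast is
 * needed at the call site. */
static void transform_new(struct sha_state_demo *st, const uint8_t *data,
			  int blocks)
{ (void)st; (void)data; (void)blocks; }

int main(void)
{
	struct sha_state_demo st = { { 0 } };
	uint8_t block[64] = { 0 };
	block_fn *fn;

	fn = (block_fn *)transform_old;	/* compiles, but an indirect call
					 * through fn would now trip CFI */
	fn = transform_new;		/* exact type match: call allowed */
	fn(&st, block, 1);
	return 0;
}

End of aside.]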
Additionally, this renames various arguments for clarity, as suggested by Eric Biggers. Signed-off-by: Kees Cook Signed-off-by: Herbert Xu --- arch/x86/crypto/sha1_avx2_x86_64_asm.S | 6 +-- arch/x86/crypto/sha1_ssse3_asm.S | 14 ++++--- arch/x86/crypto/sha1_ssse3_glue.c | 70 +++++++++++++++------------------- arch/x86/crypto/sha256-avx-asm.S | 4 +- arch/x86/crypto/sha256-avx2-asm.S | 4 +- arch/x86/crypto/sha256-ssse3-asm.S | 6 ++- arch/x86/crypto/sha256_ssse3_glue.c | 34 ++++++++--------- arch/x86/crypto/sha512-avx-asm.S | 11 +++--- arch/x86/crypto/sha512-avx2-asm.S | 11 +++--- arch/x86/crypto/sha512-ssse3-asm.S | 13 ++++--- arch/x86/crypto/sha512_ssse3_glue.c | 31 ++++++++------- 11 files changed, 102 insertions(+), 102 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S index 6decc85ef7b7..1e594d60afa5 100644 --- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S +++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S @@ -62,11 +62,11 @@ *Visit http://software.intel.com/en-us/articles/ *and refer to improving-the-performance-of-the-secure-hash-algorithm-1/ * - *Updates 20-byte SHA-1 record in 'hash' for even number of - *'num_blocks' consecutive 64-byte blocks + *Updates 20-byte SHA-1 record at start of 'state', from 'input', for + *even number of 'blocks' consecutive 64-byte blocks. * *extern "C" void sha1_transform_avx2( - * int *hash, const char* input, size_t num_blocks ); + * struct sha1_state *state, const u8* input, int blocks ); */ #include diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S index 5d03c1173690..12e2d19d7402 100644 --- a/arch/x86/crypto/sha1_ssse3_asm.S +++ b/arch/x86/crypto/sha1_ssse3_asm.S @@ -457,9 +457,13 @@ W_PRECALC_SSSE3 movdqu \a,\b .endm -/* SSSE3 optimized implementation: - * extern "C" void sha1_transform_ssse3(u32 *digest, const char *data, u32 *ws, - * unsigned int rounds); +/* + * SSSE3 optimized implementation: + * + * extern "C" void sha1_transform_ssse3(struct sha1_state *state, + * const u8 *data, int blocks); + * + * Note that struct sha1_state is assumed to begin with u32 state[5]. */ SHA1_VECTOR_ASM sha1_transform_ssse3 @@ -545,8 +549,8 @@ W_PRECALC_AVX /* AVX optimized implementation: - * extern "C" void sha1_transform_avx(u32 *digest, const char *data, u32 *ws, - * unsigned int rounds); + * extern "C" void sha1_transform_avx(struct sha1_state *state, + * const u8 *data, int blocks); */ SHA1_VECTOR_ASM sha1_transform_avx diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index 639d4c2fd6a8..d70b40ad594c 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -27,11 +27,8 @@ #include #include -typedef void (sha1_transform_fn)(u32 *digest, const char *data, - unsigned int rounds); - static int sha1_update(struct shash_desc *desc, const u8 *data, - unsigned int len, sha1_transform_fn *sha1_xform) + unsigned int len, sha1_block_fn *sha1_xform) { struct sha1_state *sctx = shash_desc_ctx(desc); @@ -39,48 +36,47 @@ static int sha1_update(struct shash_desc *desc, const u8 *data, (sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE) return crypto_sha1_update(desc, data, len); - /* make sure casting to sha1_block_fn() is safe */ + /* + * Make sure struct sha1_state begins directly with the SHA1 + * 160-bit internal state, as this is what the asm functions expect. 
+ */ BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0); kernel_fpu_begin(); - sha1_base_do_update(desc, data, len, - (sha1_block_fn *)sha1_xform); + sha1_base_do_update(desc, data, len, sha1_xform); kernel_fpu_end(); return 0; } static int sha1_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out, sha1_transform_fn *sha1_xform) + unsigned int len, u8 *out, sha1_block_fn *sha1_xform) { if (!crypto_simd_usable()) return crypto_sha1_finup(desc, data, len, out); kernel_fpu_begin(); if (len) - sha1_base_do_update(desc, data, len, - (sha1_block_fn *)sha1_xform); - sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_xform); + sha1_base_do_update(desc, data, len, sha1_xform); + sha1_base_do_finalize(desc, sha1_xform); kernel_fpu_end(); return sha1_base_finish(desc, out); } -asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data, - unsigned int rounds); +asmlinkage void sha1_transform_ssse3(struct sha1_state *state, + const u8 *data, int blocks); static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - return sha1_update(desc, data, len, - (sha1_transform_fn *) sha1_transform_ssse3); + return sha1_update(desc, data, len, sha1_transform_ssse3); } static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - return sha1_finup(desc, data, len, out, - (sha1_transform_fn *) sha1_transform_ssse3); + return sha1_finup(desc, data, len, out, sha1_transform_ssse3); } /* Add padding and return the message digest. */ @@ -119,21 +115,19 @@ static void unregister_sha1_ssse3(void) } #ifdef CONFIG_AS_AVX -asmlinkage void sha1_transform_avx(u32 *digest, const char *data, - unsigned int rounds); +asmlinkage void sha1_transform_avx(struct sha1_state *state, + const u8 *data, int blocks); static int sha1_avx_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - return sha1_update(desc, data, len, - (sha1_transform_fn *) sha1_transform_avx); + return sha1_update(desc, data, len, sha1_transform_avx); } static int sha1_avx_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - return sha1_finup(desc, data, len, out, - (sha1_transform_fn *) sha1_transform_avx); + return sha1_finup(desc, data, len, out, sha1_transform_avx); } static int sha1_avx_final(struct shash_desc *desc, u8 *out) @@ -190,8 +184,8 @@ static inline void unregister_sha1_avx(void) { } #if defined(CONFIG_AS_AVX2) && defined(CONFIG_AS_AVX) #define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */ -asmlinkage void sha1_transform_avx2(u32 *digest, const char *data, - unsigned int rounds); +asmlinkage void sha1_transform_avx2(struct sha1_state *state, + const u8 *data, int blocks); static bool avx2_usable(void) { @@ -203,28 +197,26 @@ static bool avx2_usable(void) return false; } -static void sha1_apply_transform_avx2(u32 *digest, const char *data, - unsigned int rounds) +static void sha1_apply_transform_avx2(struct sha1_state *state, + const u8 *data, int blocks) { /* Select the optimal transform based on data block size */ - if (rounds >= SHA1_AVX2_BLOCK_OPTSIZE) - sha1_transform_avx2(digest, data, rounds); + if (blocks >= SHA1_AVX2_BLOCK_OPTSIZE) + sha1_transform_avx2(state, data, blocks); else - sha1_transform_avx(digest, data, rounds); + sha1_transform_avx(state, data, blocks); } static int sha1_avx2_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - return sha1_update(desc, data, len, - (sha1_transform_fn *) sha1_apply_transform_avx2); + return sha1_update(desc, data, len,
sha1_apply_transform_avx2); } static int sha1_avx2_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - return sha1_finup(desc, data, len, out, - (sha1_transform_fn *) sha1_apply_transform_avx2); + return sha1_finup(desc, data, len, out, sha1_apply_transform_avx2); } static int sha1_avx2_final(struct shash_desc *desc, u8 *out) @@ -267,21 +259,19 @@ static inline void unregister_sha1_avx2(void) { } #endif #ifdef CONFIG_AS_SHA1_NI -asmlinkage void sha1_ni_transform(u32 *digest, const char *data, - unsigned int rounds); +asmlinkage void sha1_ni_transform(struct sha1_state *digest, const u8 *data, + int rounds); static int sha1_ni_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - return sha1_update(desc, data, len, - (sha1_transform_fn *) sha1_ni_transform); + return sha1_update(desc, data, len, sha1_ni_transform); } static int sha1_ni_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - return sha1_finup(desc, data, len, out, - (sha1_transform_fn *) sha1_ni_transform); + return sha1_finup(desc, data, len, out, sha1_ni_transform); } static int sha1_ni_final(struct shash_desc *desc, u8 *out) diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S index 22e14c8dd2e4..fcbc30f58c38 100644 --- a/arch/x86/crypto/sha256-avx-asm.S +++ b/arch/x86/crypto/sha256-avx-asm.S @@ -341,8 +341,8 @@ a = TMP_ .endm ######################################################################## -## void sha256_transform_avx(void *input_data, UINT32 digest[8], UINT64 num_blks) -## arg 1 : pointer to digest +## void sha256_transform_avx(struct sha256_state *state, const u8 *data, int blocks) +## arg 1 : pointer to state ## arg 2 : pointer to input data ## arg 3 : Num blocks ######################################################################## diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S index 519b551ad576..499d9ec129de 100644 --- a/arch/x86/crypto/sha256-avx2-asm.S +++ b/arch/x86/crypto/sha256-avx2-asm.S @@ -520,8 +520,8 @@ STACK_SIZE = _RSP + _RSP_SIZE .endm ######################################################################## -## void sha256_transform_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks) -## arg 1 : pointer to digest +## void sha256_transform_rorx(struct sha256_state *state, const u8 *data, int blocks) +## arg 1 : pointer to state ## arg 2 : pointer to input data ## arg 3 : Num blocks ######################################################################## diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S index 69cc2f91dc4c..ddfa863b4ee3 100644 --- a/arch/x86/crypto/sha256-ssse3-asm.S +++ b/arch/x86/crypto/sha256-ssse3-asm.S @@ -347,8 +347,10 @@ a = TMP_ .endm ######################################################################## -## void sha256_transform_ssse3(void *input_data, UINT32 digest[8], UINT64 num_blks) -## arg 1 : pointer to digest +## void sha256_transform_ssse3(struct sha256_state *state, const u8 *data, +## int blocks); +## arg 1 : pointer to state +## (struct sha256_state is assumed to begin with u32 state[8]) ## arg 2 : pointer to input data ## arg 3 : Num blocks ######################################################################## diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index f9aff31fe59e..03ad657c04bd 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c @@ -41,12 +41,11 @@ #include #include -asmlinkage void
sha256_transform_ssse3(u32 *digest, const char *data, - u64 rounds); -typedef void (sha256_transform_fn)(u32 *digest, const char *data, u64 rounds); +asmlinkage void sha256_transform_ssse3(struct sha256_state *state, + const u8 *data, int blocks); static int _sha256_update(struct shash_desc *desc, const u8 *data, - unsigned int len, sha256_transform_fn *sha256_xform) + unsigned int len, sha256_block_fn *sha256_xform) { struct sha256_state *sctx = shash_desc_ctx(desc); @@ -54,28 +53,29 @@ static int _sha256_update(struct shash_desc *desc, const u8 *data, (sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE) return crypto_sha256_update(desc, data, len); - /* make sure casting to sha256_block_fn() is safe */ + /* + * Make sure struct sha256_state begins directly with the SHA256 + * 256-bit internal state, as this is what the asm functions expect. + */ BUILD_BUG_ON(offsetof(struct sha256_state, state) != 0); kernel_fpu_begin(); - sha256_base_do_update(desc, data, len, - (sha256_block_fn *)sha256_xform); + sha256_base_do_update(desc, data, len, sha256_xform); kernel_fpu_end(); return 0; } static int sha256_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out, sha256_transform_fn *sha256_xform) + unsigned int len, u8 *out, sha256_block_fn *sha256_xform) { if (!crypto_simd_usable()) return crypto_sha256_finup(desc, data, len, out); kernel_fpu_begin(); if (len) - sha256_base_do_update(desc, data, len, - (sha256_block_fn *)sha256_xform); - sha256_base_do_finalize(desc, (sha256_block_fn *)sha256_xform); + sha256_base_do_update(desc, data, len, sha256_xform); + sha256_base_do_finalize(desc, sha256_xform); kernel_fpu_end(); return sha256_base_finish(desc, out); @@ -145,8 +145,8 @@ static void unregister_sha256_ssse3(void) } #ifdef CONFIG_AS_AVX -asmlinkage void sha256_transform_avx(u32 *digest, const char *data, - u64 rounds); +asmlinkage void sha256_transform_avx(struct sha256_state *state, + const u8 *data, int blocks); static int sha256_avx_update(struct shash_desc *desc, const u8 *data, unsigned int len) @@ -227,8 +227,8 @@ static inline void unregister_sha256_avx(void) { } #endif #if defined(CONFIG_AS_AVX2) && defined(CONFIG_AS_AVX) -asmlinkage void sha256_transform_rorx(u32 *digest, const char *data, - u64 rounds); +asmlinkage void sha256_transform_rorx(struct sha256_state *state, + const u8 *data, int blocks); static int sha256_avx2_update(struct shash_desc *desc, const u8 *data, unsigned int len) @@ -307,8 +307,8 @@ static inline void unregister_sha256_avx2(void) { } #endif #ifdef CONFIG_AS_SHA256_NI -asmlinkage void sha256_ni_transform(u32 *digest, const char *data, - u64 rounds); /*unsigned int rounds);*/ +asmlinkage void sha256_ni_transform(struct sha256_state *digest, + const u8 *data, int rounds); static int sha256_ni_update(struct shash_desc *desc, const u8 *data, unsigned int len) diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S index 3704ddd7e5d5..90ea945ba5e6 100644 --- a/arch/x86/crypto/sha512-avx-asm.S +++ b/arch/x86/crypto/sha512-avx-asm.S @@ -271,11 +271,12 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE .endm ######################################################################## -# void sha512_transform_avx(void* D, const void* M, u64 L) -# Purpose: Updates the SHA512 digest stored at D with the message stored in M. -# The size of the message pointed to by M must be an integer multiple of SHA512 -# message blocks. 
-# L is the message length in SHA512 blocks +# void sha512_transform_avx(sha512_state *state, const u8 *data, int blocks) +# Purpose: Updates the SHA512 digest stored at "state" with the message +# stored in "data". +# The size of the message pointed to by "data" must be an integer multiple +# of SHA512 message blocks. +# "blocks" is the message length in SHA512 blocks ######################################################################## SYM_FUNC_START(sha512_transform_avx) cmp $0, msglen diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S index 80d830e7ee09..3dd886b14e7d 100644 --- a/arch/x86/crypto/sha512-avx2-asm.S +++ b/arch/x86/crypto/sha512-avx2-asm.S @@ -563,11 +563,12 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE .endm ######################################################################## -# void sha512_transform_rorx(void* D, const void* M, uint64_t L)# -# Purpose: Updates the SHA512 digest stored at D with the message stored in M. -# The size of the message pointed to by M must be an integer multiple of SHA512 -# message blocks. -# L is the message length in SHA512 blocks +# void sha512_transform_rorx(sha512_state *state, const u8 *data, int blocks) +# Purpose: Updates the SHA512 digest stored at "state" with the message +# stored in "data". +# The size of the message pointed to by "data" must be an integer multiple +# of SHA512 message blocks. +# "blocks" is the message length in SHA512 blocks ######################################################################## SYM_FUNC_START(sha512_transform_rorx) # Allocate Stack Space diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S index 838f984e95d9..7946a1bee85b 100644 --- a/arch/x86/crypto/sha512-ssse3-asm.S +++ b/arch/x86/crypto/sha512-ssse3-asm.S @@ -269,11 +269,14 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE .endm ######################################################################## -# void sha512_transform_ssse3(void* D, const void* M, u64 L)# -# Purpose: Updates the SHA512 digest stored at D with the message stored in M. -# The size of the message pointed to by M must be an integer multiple of SHA512 -# message blocks. -# L is the message length in SHA512 blocks. +## void sha512_transform_ssse3(struct sha512_state *state, const u8 *data, +## int blocks); +# (struct sha512_state is assumed to begin with u64 state[8]) +# Purpose: Updates the SHA512 digest stored at "state" with the message +# stored in "data". +# The size of the message pointed to by "data" must be an integer multiple +# of SHA512 message blocks. +# "blocks" is the message length in SHA512 blocks.
######################################################################## SYM_FUNC_START(sha512_transform_ssse3) diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index 458356a3f124..1c444f41037c 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -39,13 +39,11 @@ #include #include -asmlinkage void sha512_transform_ssse3(u64 *digest, const char *data, - u64 rounds); - -typedef void (sha512_transform_fn)(u64 *digest, const char *data, u64 rounds); +asmlinkage void sha512_transform_ssse3(struct sha512_state *state, + const u8 *data, int blocks); static int sha512_update(struct shash_desc *desc, const u8 *data, - unsigned int len, sha512_transform_fn *sha512_xform) + unsigned int len, sha512_block_fn *sha512_xform) { struct sha512_state *sctx = shash_desc_ctx(desc); @@ -53,28 +51,29 @@ static int sha512_update(struct shash_desc *desc, const u8 *data, (sctx->count[0] % SHA512_BLOCK_SIZE) + len < SHA512_BLOCK_SIZE) return crypto_sha512_update(desc, data, len); - /* make sure casting to sha512_block_fn() is safe */ + /* + * Make sure struct sha512_state begins directly with the SHA512 + * 512-bit internal state, as this is what the asm functions expect. + */ BUILD_BUG_ON(offsetof(struct sha512_state, state) != 0); kernel_fpu_begin(); - sha512_base_do_update(desc, data, len, - (sha512_block_fn *)sha512_xform); + sha512_base_do_update(desc, data, len, sha512_xform); kernel_fpu_end(); return 0; } static int sha512_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out, sha512_transform_fn *sha512_xform) + unsigned int len, u8 *out, sha512_block_fn *sha512_xform) { if (!crypto_simd_usable()) return crypto_sha512_finup(desc, data, len, out); kernel_fpu_begin(); if (len) - sha512_base_do_update(desc, data, len, - (sha512_block_fn *)sha512_xform); - sha512_base_do_finalize(desc, (sha512_block_fn *)sha512_xform); + sha512_base_do_update(desc, data, len, sha512_xform); + sha512_base_do_finalize(desc, sha512_xform); kernel_fpu_end(); return sha512_base_finish(desc, out); @@ -144,8 +143,8 @@ static void unregister_sha512_ssse3(void) } #ifdef CONFIG_AS_AVX -asmlinkage void sha512_transform_avx(u64 *digest, const char *data, - u64 rounds); +asmlinkage void sha512_transform_avx(struct sha512_state *state, + const u8 *data, int blocks); static bool avx_usable(void) { if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { @@ -225,8 +224,8 @@ static inline void unregister_sha512_avx(void) { } #endif #if defined(CONFIG_AS_AVX2) && defined(CONFIG_AS_AVX) -asmlinkage void sha512_transform_rorx(u64 *digest, const char *data, - u64 rounds); +asmlinkage void sha512_transform_rorx(struct sha512_state *state, + const u8 *data, int blocks); static int sha512_avx2_update(struct shash_desc *desc, const u8 *data, unsigned int len) -- cgit v1.2.3 From 1f6868995326cc82102049e349d8dbd116bdb656 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Thu, 16 Jan 2020 18:23:55 +0100 Subject: crypto: x86/poly1305 - fix .gitignore typo Amidst the kbuild-robot-induced changes, the .gitignore file for the generated file wasn't updated with the non-clashing filename. This commit adjusts that. Signed-off-by: Jason A.
Donenfeld Signed-off-by: Herbert Xu --- arch/x86/crypto/.gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/.gitignore b/arch/x86/crypto/.gitignore index c406ea6571fa..30be0400a439 100644 --- a/arch/x86/crypto/.gitignore +++ b/arch/x86/crypto/.gitignore @@ -1 +1 @@ -poly1305-x86_64.S +poly1305-x86_64-cryptogams.S -- cgit v1.2.3 From f9e7fe32a792726186301423ff63a465d63386e1 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Fri, 17 Jan 2020 11:42:22 +0100 Subject: crypto: x86/poly1305 - emit does base conversion itself The emit code does optional base conversion itself in assembly, so we don't need to do that here. Also, neither one of these functions uses simd instructions, so checking for that doesn't make sense either. Signed-off-by: Jason A. Donenfeld Signed-off-by: Herbert Xu --- arch/x86/crypto/poly1305_glue.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c index 657363588e0c..79bb58737d52 100644 --- a/arch/x86/crypto/poly1305_glue.c +++ b/arch/x86/crypto/poly1305_glue.c @@ -123,13 +123,9 @@ static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len, static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], const u32 nonce[4]) { - struct poly1305_arch_internal *state = ctx; - - if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) || - !state->is_base2_26 || !crypto_simd_usable()) { - convert_to_base2_64(ctx); + if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx)) poly1305_emit_x86_64(ctx, mac, nonce); - } else + else poly1305_emit_avx(ctx, mac, nonce); } -- cgit v1.2.3
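For context on the base conversion described above: the AVX code keeps Poly1305's roughly 130-bit accumulator in five 26-bit limbs (base 2^26), which suits SIMD multiply-and-carry, while the emit routines consume it as 64-bit words (base 2^64). A minimal standalone sketch of that repacking, with hypothetical names and layout (the real conversion lives in the generated poly1305-x86_64-cryptogams.S assembly, so this is illustrative only):

	/*
	 * Illustrative sketch only (hypothetical names): repack five 26-bit
	 * limbs into three 64-bit words. No modular reduction is done here;
	 * both forms encode the same ~130-bit accumulator value.
	 */
	#include <inttypes.h>
	#include <stdint.h>
	#include <stdio.h>

	static void base2_26_to_base2_64(const uint32_t in[5], uint64_t out[3])
	{
		uint32_t h[5];

		/* Propagate carries so each limb fits in 26 bits again. */
		for (int i = 0; i < 5; i++)
			h[i] = in[i];
		for (int i = 0; i < 4; i++) {
			h[i + 1] += h[i] >> 26;
			h[i] &= 0x3ffffff;
		}

		/* Limb i sits at bit 26*i; split it across 64-bit words. */
		out[0] = (uint64_t)h[0] | ((uint64_t)h[1] << 26) |
			 ((uint64_t)h[2] << 52);
		out[1] = ((uint64_t)h[2] >> 12) | ((uint64_t)h[3] << 14) |
			 ((uint64_t)h[4] << 40);
		out[2] = (uint64_t)h[4] >> 24;
	}

	int main(void)
	{
		const uint32_t h26[5] = { 1, 2, 3, 4, 5 };
		uint64_t h64[3];

		base2_26_to_base2_64(h26, h64);
		printf("%" PRIx64 " %016" PRIx64 " %016" PRIx64 "\n",
		       h64[2], h64[1], h64[0]);
		return 0;
	}

With the assembly emit path handling this internally, the glue code above only has to choose between poly1305_emit_x86_64() and poly1305_emit_avx(), as the final hunk shows.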