diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-07-26 13:40:17 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-07-26 13:40:17 -0700 |
commit | bbce2ad2d711c12d93145a7bbdf086e73f414bcd (patch) | |
tree | 35432a39f68f4c5df44ed38037cbf05adadb923e /arch | |
parent | 0f776dc377f6c87f4e4d4a5f63602f33fb93b31e (diff) | |
parent | 0f95e2ffc58f5d32a90eb1051d17aeebc21cf91d (diff) | |
download | linux-bbce2ad2d711c12d93145a7bbdf086e73f414bcd.tar.bz2 |
Merge branch 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Pull crypto updates from Herbert Xu:
"Here is the crypto update for 4.8:
API:
- first part of skcipher low-level conversions
- add KPP (Key-agreement Protocol Primitives) interface.
Algorithms:
- fix IPsec/cryptd reordering issues that affects aesni
- RSA no longer does explicit leading zero removal
- add SHA3
- add DH
- add ECDH
- improve DRBG performance by not doing CTR by hand
Drivers:
- add x86 AVX2 multibuffer SHA256/512
- add POWER8 optimised crc32c
- add xts support to vmx
- add DH support to qat
- add RSA support to caam
- add Layerscape support to caam
- add SEC1 AEAD support to talitos
- improve performance by chaining requests in marvell/cesa
- add support for Araneus Alea I USB RNG
- add support for Broadcom BCM5301 RNG
- add support for Amlogic Meson RNG
- add support Broadcom NSP SoC RNG"
* 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (180 commits)
crypto: vmx - Fix aes_p8_xts_decrypt build failure
crypto: vmx - Ignore generated files
crypto: vmx - Adding support for XTS
crypto: vmx - Adding asm subroutines for XTS
crypto: skcipher - add comment for skcipher_alg->base
crypto: testmgr - Print akcipher algorithm name
crypto: marvell - Fix wrong flag used for GFP in mv_cesa_dma_add_iv_op
crypto: nx - off by one bug in nx_of_update_msc()
crypto: rsa-pkcs1pad - fix rsa-pkcs1pad request struct
crypto: scatterwalk - Inline start/map/done
crypto: scatterwalk - Remove unnecessary BUG in scatterwalk_start
crypto: scatterwalk - Remove unnecessary advance in scatterwalk_pagedone
crypto: scatterwalk - Fix test in scatterwalk_done
crypto: api - Optimise away crypto_yield when hard preemption is on
crypto: scatterwalk - add no-copy support to copychunks
crypto: scatterwalk - Remove scatterwalk_bytes_sglen
crypto: omap - Stop using crypto scatterwalk_bytes_sglen
crypto: skcipher - Remove top-level givcipher interface
crypto: user - Remove crypto_lookup_skcipher call
crypto: cts - Convert to skcipher
...
Diffstat (limited to 'arch')
47 files changed, 7582 insertions, 291 deletions
diff --git a/arch/arm/boot/dts/bcm-nsp.dtsi b/arch/arm/boot/dts/bcm-nsp.dtsi index def9e783b5c6..1ed829e699d4 100644 --- a/arch/arm/boot/dts/bcm-nsp.dtsi +++ b/arch/arm/boot/dts/bcm-nsp.dtsi @@ -206,6 +206,11 @@ brcm,nand-has-wp; }; + rng: rng@33000 { + compatible = "brcm,bcm-nsp-rng"; + reg = <0x33000 0x14>; + }; + ccbtimer0: timer@34000 { compatible = "arm,sp804"; reg = <0x34000 0x1000>; diff --git a/arch/arm/crypto/ghash-ce-glue.c b/arch/arm/crypto/ghash-ce-glue.c index 03a39fe29246..1568cb5cd870 100644 --- a/arch/arm/crypto/ghash-ce-glue.c +++ b/arch/arm/crypto/ghash-ce-glue.c @@ -154,30 +154,23 @@ static int ghash_async_init(struct ahash_request *req) struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); struct ahash_request *cryptd_req = ahash_request_ctx(req); struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; + struct shash_desc *desc = cryptd_shash_desc(cryptd_req); + struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm); - if (!may_use_simd()) { - memcpy(cryptd_req, req, sizeof(*req)); - ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); - return crypto_ahash_init(cryptd_req); - } else { - struct shash_desc *desc = cryptd_shash_desc(cryptd_req); - struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm); - - desc->tfm = child; - desc->flags = req->base.flags; - return crypto_shash_init(desc); - } + desc->tfm = child; + desc->flags = req->base.flags; + return crypto_shash_init(desc); } static int ghash_async_update(struct ahash_request *req) { struct ahash_request *cryptd_req = ahash_request_ctx(req); + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); + struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - if (!may_use_simd()) { - struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); - struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - + if (!may_use_simd() || + (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) { memcpy(cryptd_req, req, sizeof(*req)); ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); return crypto_ahash_update(cryptd_req); @@ -190,12 +183,12 @@ static int ghash_async_update(struct ahash_request *req) static int ghash_async_final(struct ahash_request *req) { struct ahash_request *cryptd_req = ahash_request_ctx(req); + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); + struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - if (!may_use_simd()) { - struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); - struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - + if (!may_use_simd() || + (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) { memcpy(cryptd_req, req, sizeof(*req)); ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); return crypto_ahash_final(cryptd_req); @@ -212,7 +205,8 @@ static int ghash_async_digest(struct ahash_request *req) struct ahash_request *cryptd_req = ahash_request_ctx(req); struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - if (!may_use_simd()) { + if (!may_use_simd() || + (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) { memcpy(cryptd_req, req, sizeof(*req)); ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); return crypto_ahash_digest(cryptd_req); diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1043a-rdb.dts b/arch/arm64/boot/dts/freescale/fsl-ls1043a-rdb.dts index f895fc02ab06..40846319be69 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1043a-rdb.dts +++ b/arch/arm64/boot/dts/freescale/fsl-ls1043a-rdb.dts @@ -49,6 +49,10 @@ / { model = "LS1043A RDB Board"; + + aliases { + crypto = &crypto; + }; }; &i2c0 { diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi index de0323b48b1e..6bd46c133010 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi @@ -159,6 +159,49 @@ big-endian; }; + crypto: crypto@1700000 { + compatible = "fsl,sec-v5.4", "fsl,sec-v5.0", + "fsl,sec-v4.0"; + fsl,sec-era = <3>; + #address-cells = <1>; + #size-cells = <1>; + ranges = <0x0 0x00 0x1700000 0x100000>; + reg = <0x00 0x1700000 0x0 0x100000>; + interrupts = <0 75 0x4>; + + sec_jr0: jr@10000 { + compatible = "fsl,sec-v5.4-job-ring", + "fsl,sec-v5.0-job-ring", + "fsl,sec-v4.0-job-ring"; + reg = <0x10000 0x10000>; + interrupts = <0 71 0x4>; + }; + + sec_jr1: jr@20000 { + compatible = "fsl,sec-v5.4-job-ring", + "fsl,sec-v5.0-job-ring", + "fsl,sec-v4.0-job-ring"; + reg = <0x20000 0x10000>; + interrupts = <0 72 0x4>; + }; + + sec_jr2: jr@30000 { + compatible = "fsl,sec-v5.4-job-ring", + "fsl,sec-v5.0-job-ring", + "fsl,sec-v4.0-job-ring"; + reg = <0x30000 0x10000>; + interrupts = <0 73 0x4>; + }; + + sec_jr3: jr@40000 { + compatible = "fsl,sec-v5.4-job-ring", + "fsl,sec-v5.0-job-ring", + "fsl,sec-v4.0-job-ring"; + reg = <0x40000 0x10000>; + interrupts = <0 74 0x4>; + }; + }; + dcfg: dcfg@1ee0000 { compatible = "fsl,ls1043a-dcfg", "syscon"; reg = <0x0 0x1ee0000 0x0 0x10000>; diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 44be1e03ed65..9b6e408cfa51 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -174,13 +174,15 @@ extern void __iomem *ioremap_cache(phys_addr_t phys_addr, size_t size); #define iounmap __iounmap /* - * io{read,write}{16,32}be() macros + * io{read,write}{16,32,64}be() macros */ #define ioread16be(p) ({ __u16 __v = be16_to_cpu((__force __be16)__raw_readw(p)); __iormb(); __v; }) #define ioread32be(p) ({ __u32 __v = be32_to_cpu((__force __be32)__raw_readl(p)); __iormb(); __v; }) +#define ioread64be(p) ({ __u64 __v = be64_to_cpu((__force __be64)__raw_readq(p)); __iormb(); __v; }) #define iowrite16be(v,p) ({ __iowmb(); __raw_writew((__force __u16)cpu_to_be16(v), p); }) #define iowrite32be(v,p) ({ __iowmb(); __raw_writel((__force __u32)cpu_to_be32(v), p); }) +#define iowrite64be(v,p) ({ __iowmb(); __raw_writeq((__force __u64)cpu_to_be64(v), p); }) /* * Convert a physical pointer to a virtual kernel pointer for /dev/mem diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile index 9c221b69c181..7998c177f0a2 100644 --- a/arch/powerpc/crypto/Makefile +++ b/arch/powerpc/crypto/Makefile @@ -9,9 +9,11 @@ obj-$(CONFIG_CRYPTO_MD5_PPC) += md5-ppc.o obj-$(CONFIG_CRYPTO_SHA1_PPC) += sha1-powerpc.o obj-$(CONFIG_CRYPTO_SHA1_PPC_SPE) += sha1-ppc-spe.o obj-$(CONFIG_CRYPTO_SHA256_PPC_SPE) += sha256-ppc-spe.o +obj-$(CONFIG_CRYPT_CRC32C_VPMSUM) += crc32c-vpmsum.o aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes-spe-glue.o md5-ppc-y := md5-asm.o md5-glue.o sha1-powerpc-y := sha1-powerpc-asm.o sha1.o sha1-ppc-spe-y := sha1-spe-asm.o sha1-spe-glue.o sha256-ppc-spe-y := sha256-spe-asm.o sha256-spe-glue.o +crc32c-vpmsum-y := crc32c-vpmsum_asm.o crc32c-vpmsum_glue.o diff --git a/arch/powerpc/crypto/aes-spe-regs.h b/arch/powerpc/crypto/aes-spe-regs.h index 30d217b399c3..2cc3a2caadae 100644 --- a/arch/powerpc/crypto/aes-spe-regs.h +++ b/arch/powerpc/crypto/aes-spe-regs.h @@ -18,7 +18,7 @@ #define rLN r7 /* length of data to be processed */ #define rIP r8 /* potiner to IV (CBC/CTR/XTS modes) */ #define rKT r9 /* pointer to tweak key (XTS mode) */ -#define rT0 r11 /* pointers to en-/decrpytion tables */ +#define rT0 r11 /* pointers to en-/decryption tables */ #define rT1 r10 #define rD0 r9 /* data */ #define rD1 r14 diff --git a/arch/powerpc/crypto/crc32c-vpmsum_asm.S b/arch/powerpc/crypto/crc32c-vpmsum_asm.S new file mode 100644 index 000000000000..dc640b212299 --- /dev/null +++ b/arch/powerpc/crypto/crc32c-vpmsum_asm.S @@ -0,0 +1,1553 @@ +/* + * Calculate the checksum of data that is 16 byte aligned and a multiple of + * 16 bytes. + * + * The first step is to reduce it to 1024 bits. We do this in 8 parallel + * chunks in order to mask the latency of the vpmsum instructions. If we + * have more than 32 kB of data to checksum we repeat this step multiple + * times, passing in the previous 1024 bits. + * + * The next step is to reduce the 1024 bits to 64 bits. This step adds + * 32 bits of 0s to the end - this matches what a CRC does. We just + * calculate constants that land the data in this 32 bits. + * + * We then use fixed point Barrett reduction to compute a mod n over GF(2) + * for n = CRC using POWER8 instructions. We use x = 32. + * + * http://en.wikipedia.org/wiki/Barrett_reduction + * + * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <asm/ppc_asm.h> +#include <asm/ppc-opcode.h> + + .section .rodata +.balign 16 + +.byteswap_constant: + /* byte reverse permute constant */ + .octa 0x0F0E0D0C0B0A09080706050403020100 + +#define MAX_SIZE 32768 +.constants: + + /* Reduce 262144 kbits to 1024 bits */ + /* x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1 */ + .octa 0x00000000b6ca9e20000000009c37c408 + + /* x^260096 mod p(x)` << 1, x^260160 mod p(x)` << 1 */ + .octa 0x00000000350249a800000001b51df26c + + /* x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1 */ + .octa 0x00000001862dac54000000000724b9d0 + + /* x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1 */ + .octa 0x00000001d87fb48c00000001c00532fe + + /* x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1 */ + .octa 0x00000001f39b699e00000000f05a9362 + + /* x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1 */ + .octa 0x0000000101da11b400000001e1007970 + + /* x^254976 mod p(x)` << 1, x^255040 mod p(x)` << 1 */ + .octa 0x00000001cab571e000000000a57366ee + + /* x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1 */ + .octa 0x00000000c7020cfe0000000192011284 + + /* x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1 */ + .octa 0x00000000cdaed1ae0000000162716d9a + + /* x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1 */ + .octa 0x00000001e804effc00000000cd97ecde + + /* x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1 */ + .octa 0x0000000077c3ea3a0000000058812bc0 + + /* x^249856 mod p(x)` << 1, x^249920 mod p(x)` << 1 */ + .octa 0x0000000068df31b40000000088b8c12e + + /* x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1 */ + .octa 0x00000000b059b6c200000001230b234c + + /* x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1 */ + .octa 0x0000000145fb8ed800000001120b416e + + /* x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1 */ + .octa 0x00000000cbc0916800000001974aecb0 + + /* x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1 */ + .octa 0x000000005ceeedc2000000008ee3f226 + + /* x^244736 mod p(x)` << 1, x^244800 mod p(x)` << 1 */ + .octa 0x0000000047d74e8600000001089aba9a + + /* x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1 */ + .octa 0x00000001407e9e220000000065113872 + + /* x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1 */ + .octa 0x00000001da967bda000000005c07ec10 + + /* x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1 */ + .octa 0x000000006c8983680000000187590924 + + /* x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1 */ + .octa 0x00000000f2d14c9800000000e35da7c6 + + /* x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1 */ + .octa 0x00000001993c6ad4000000000415855a + + /* x^238592 mod p(x)` << 1, x^238656 mod p(x)` << 1 */ + .octa 0x000000014683d1ac0000000073617758 + + /* x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1 */ + .octa 0x00000001a7c93e6c0000000176021d28 + + /* x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1 */ + .octa 0x000000010211e90a00000001c358fd0a + + /* x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1 */ + .octa 0x000000001119403e00000001ff7a2c18 + + /* x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1 */ + .octa 0x000000001c3261aa00000000f2d9f7e4 + + /* x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1 */ + .octa 0x000000014e37a634000000016cf1f9c8 + + /* x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1 */ + .octa 0x0000000073786c0c000000010af9279a + + /* x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1 */ + .octa 0x000000011dc037f80000000004f101e8 + + /* x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1 */ + .octa 0x0000000031433dfc0000000070bcf184 + + /* x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1 */ + .octa 0x000000009cde8348000000000a8de642 + + /* x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1 */ + .octa 0x0000000038d3c2a60000000062ea130c + + /* x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1 */ + .octa 0x000000011b25f26000000001eb31cbb2 + + /* x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1 */ + .octa 0x000000001629e6f00000000170783448 + + /* x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1 */ + .octa 0x0000000160838b4c00000001a684b4c6 + + /* x^224256 mod p(x)` << 1, x^224320 mod p(x)` << 1 */ + .octa 0x000000007a44011c00000000253ca5b4 + + /* x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1 */ + .octa 0x00000000226f417a0000000057b4b1e2 + + /* x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1 */ + .octa 0x0000000045eb2eb400000000b6bd084c + + /* x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1 */ + .octa 0x000000014459d70c0000000123c2d592 + + /* x^220160 mod p(x)` << 1, x^220224 mod p(x)` << 1 */ + .octa 0x00000001d406ed8200000000159dafce + + /* x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1 */ + .octa 0x0000000160c8e1a80000000127e1a64e + + /* x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1 */ + .octa 0x0000000027ba80980000000056860754 + + /* x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1 */ + .octa 0x000000006d92d01800000001e661aae8 + + /* x^216064 mod p(x)` << 1, x^216128 mod p(x)` << 1 */ + .octa 0x000000012ed7e3f200000000f82c6166 + + /* x^215040 mod p(x)` << 1, x^215104 mod p(x)` << 1 */ + .octa 0x000000002dc8778800000000c4f9c7ae + + /* x^214016 mod p(x)` << 1, x^214080 mod p(x)` << 1 */ + .octa 0x0000000018240bb80000000074203d20 + + /* x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1 */ + .octa 0x000000001ad381580000000198173052 + + /* x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1 */ + .octa 0x00000001396b78f200000001ce8aba54 + + /* x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1 */ + .octa 0x000000011a68133400000001850d5d94 + + /* x^209920 mod p(x)` << 1, x^209984 mod p(x)` << 1 */ + .octa 0x000000012104732e00000001d609239c + + /* x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1 */ + .octa 0x00000000a140d90c000000001595f048 + + /* x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1 */ + .octa 0x00000001b7215eda0000000042ccee08 + + /* x^206848 mod p(x)` << 1, x^206912 mod p(x)` << 1 */ + .octa 0x00000001aaf1df3c000000010a389d74 + + /* x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1 */ + .octa 0x0000000029d15b8a000000012a840da6 + + /* x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1 */ + .octa 0x00000000f1a96922000000001d181c0c + + /* x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1 */ + .octa 0x00000001ac80d03c0000000068b7d1f6 + + /* x^202752 mod p(x)` << 1, x^202816 mod p(x)` << 1 */ + .octa 0x000000000f11d56a000000005b0f14fc + + /* x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1 */ + .octa 0x00000001f1c022a20000000179e9e730 + + /* x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1 */ + .octa 0x0000000173d00ae200000001ce1368d6 + + /* x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1 */ + .octa 0x00000001d4ffe4ac0000000112c3a84c + + /* x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1 */ + .octa 0x000000016edc5ae400000000de940fee + + /* x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1 */ + .octa 0x00000001f1a0214000000000fe896b7e + + /* x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1 */ + .octa 0x00000000ca0b28a000000001f797431c + + /* x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1 */ + .octa 0x00000001928e30a20000000053e989ba + + /* x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1 */ + .octa 0x0000000097b1b002000000003920cd16 + + /* x^193536 mod p(x)` << 1, x^193600 mod p(x)` << 1 */ + .octa 0x00000000b15bf90600000001e6f579b8 + + /* x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1 */ + .octa 0x00000000411c5d52000000007493cb0a + + /* x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1 */ + .octa 0x00000001c36f330000000001bdd376d8 + + /* x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1 */ + .octa 0x00000001119227e0000000016badfee6 + + /* x^189440 mod p(x)` << 1, x^189504 mod p(x)` << 1 */ + .octa 0x00000000114d47020000000071de5c58 + + /* x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1 */ + .octa 0x00000000458b5b9800000000453f317c + + /* x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1 */ + .octa 0x000000012e31fb8e0000000121675cce + + /* x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1 */ + .octa 0x000000005cf619d800000001f409ee92 + + /* x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1 */ + .octa 0x0000000063f4d8b200000000f36b9c88 + + /* x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1 */ + .octa 0x000000004138dc8a0000000036b398f4 + + /* x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1 */ + .octa 0x00000001d29ee8e000000001748f9adc + + /* x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1 */ + .octa 0x000000006a08ace800000001be94ec00 + + /* x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1 */ + .octa 0x0000000127d4201000000000b74370d6 + + /* x^180224 mod p(x)` << 1, x^180288 mod p(x)` << 1 */ + .octa 0x0000000019d76b6200000001174d0b98 + + /* x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1 */ + .octa 0x00000001b1471f6e00000000befc06a4 + + /* x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1 */ + .octa 0x00000001f64c19cc00000001ae125288 + + /* x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1 */ + .octa 0x00000000003c0ea00000000095c19b34 + + /* x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1 */ + .octa 0x000000014d73abf600000001a78496f2 + + /* x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1 */ + .octa 0x00000001620eb84400000001ac5390a0 + + /* x^174080 mod p(x)` << 1, x^174144 mod p(x)` << 1 */ + .octa 0x0000000147655048000000002a80ed6e + + /* x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1 */ + .octa 0x0000000067b5077e00000001fa9b0128 + + /* x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1 */ + .octa 0x0000000010ffe20600000001ea94929e + + /* x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1 */ + .octa 0x000000000fee8f1e0000000125f4305c + + /* x^169984 mod p(x)` << 1, x^170048 mod p(x)` << 1 */ + .octa 0x00000001da26fbae00000001471e2002 + + /* x^168960 mod p(x)` << 1, x^169024 mod p(x)` << 1 */ + .octa 0x00000001b3a8bd880000000132d2253a + + /* x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1 */ + .octa 0x00000000e8f3898e00000000f26b3592 + + /* x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1 */ + .octa 0x00000000b0d0d28c00000000bc8b67b0 + + /* x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1 */ + .octa 0x0000000030f2a798000000013a826ef2 + + /* x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1 */ + .octa 0x000000000fba10020000000081482c84 + + /* x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1 */ + .octa 0x00000000bdb9bd7200000000e77307c2 + + /* x^162816 mod p(x)` << 1, x^162880 mod p(x)` << 1 */ + .octa 0x0000000075d3bf5a00000000d4a07ec8 + + /* x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1 */ + .octa 0x00000000ef1f98a00000000017102100 + + /* x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1 */ + .octa 0x00000000689c760200000000db406486 + + /* x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1 */ + .octa 0x000000016d5fa5fe0000000192db7f88 + + /* x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1 */ + .octa 0x00000001d0d2b9ca000000018bf67b1e + + /* x^157696 mod p(x)` << 1, x^157760 mod p(x)` << 1 */ + .octa 0x0000000041e7b470000000007c09163e + + /* x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1 */ + .octa 0x00000001cbb6495e000000000adac060 + + /* x^155648 mod p(x)` << 1, x^155712 mod p(x)` << 1 */ + .octa 0x000000010052a0b000000000bd8316ae + + /* x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1 */ + .octa 0x00000001d8effb5c000000019f09ab54 + + /* x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1 */ + .octa 0x00000001d969853c0000000125155542 + + /* x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1 */ + .octa 0x00000000523ccce2000000018fdb5882 + + /* x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1 */ + .octa 0x000000001e2436bc00000000e794b3f4 + + /* x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1 */ + .octa 0x00000000ddd1c3a2000000016f9bb022 + + /* x^149504 mod p(x)` << 1, x^149568 mod p(x)` << 1 */ + .octa 0x0000000019fcfe3800000000290c9978 + + /* x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1 */ + .octa 0x00000001ce95db640000000083c0f350 + + /* x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1 */ + .octa 0x00000000af5828060000000173ea6628 + + /* x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1 */ + .octa 0x00000001006388f600000001c8b4e00a + + /* x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1 */ + .octa 0x0000000179eca00a00000000de95d6aa + + /* x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1 */ + .octa 0x0000000122410a6a000000010b7f7248 + + /* x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1 */ + .octa 0x000000004288e87c00000001326e3a06 + + /* x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1 */ + .octa 0x000000016c5490da00000000bb62c2e6 + + /* x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1 */ + .octa 0x00000000d1c71f6e0000000156a4b2c2 + + /* x^140288 mod p(x)` << 1, x^140352 mod p(x)` << 1 */ + .octa 0x00000001b4ce08a6000000011dfe763a + + /* x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1 */ + .octa 0x00000001466ba60c000000007bcca8e2 + + /* x^138240 mod p(x)` << 1, x^138304 mod p(x)` << 1 */ + .octa 0x00000001f6c488a40000000186118faa + + /* x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1 */ + .octa 0x000000013bfb06820000000111a65a88 + + /* x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1 */ + .octa 0x00000000690e9e54000000003565e1c4 + + /* x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1 */ + .octa 0x00000000281346b6000000012ed02a82 + + /* x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1 */ + .octa 0x000000015646402400000000c486ecfc + + /* x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1 */ + .octa 0x000000016063a8dc0000000001b951b2 + + /* x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1 */ + .octa 0x0000000116a663620000000048143916 + + /* x^131072 mod p(x)` << 1, x^131136 mod p(x)` << 1 */ + .octa 0x000000017e8aa4d200000001dc2ae124 + + /* x^130048 mod p(x)` << 1, x^130112 mod p(x)` << 1 */ + .octa 0x00000001728eb10c00000001416c58d6 + + /* x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1 */ + .octa 0x00000001b08fd7fa00000000a479744a + + /* x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1 */ + .octa 0x00000001092a16e80000000096ca3a26 + + /* x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1 */ + .octa 0x00000000a505637c00000000ff223d4e + + /* x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1 */ + .octa 0x00000000d94869b2000000010e84da42 + + /* x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1 */ + .octa 0x00000001c8b203ae00000001b61ba3d0 + + /* x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1 */ + .octa 0x000000005704aea000000000680f2de8 + + /* x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1 */ + .octa 0x000000012e295fa2000000008772a9a8 + + /* x^121856 mod p(x)` << 1, x^121920 mod p(x)` << 1 */ + .octa 0x000000011d0908bc0000000155f295bc + + /* x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1 */ + .octa 0x0000000193ed97ea00000000595f9282 + + /* x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1 */ + .octa 0x000000013a0f1c520000000164b1c25a + + /* x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1 */ + .octa 0x000000010c2c40c000000000fbd67c50 + + /* x^117760 mod p(x)` << 1, x^117824 mod p(x)` << 1 */ + .octa 0x00000000ff6fac3e0000000096076268 + + /* x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1 */ + .octa 0x000000017b3609c000000001d288e4cc + + /* x^115712 mod p(x)` << 1, x^115776 mod p(x)` << 1 */ + .octa 0x0000000088c8c92200000001eaac1bdc + + /* x^114688 mod p(x)` << 1, x^114752 mod p(x)` << 1 */ + .octa 0x00000001751baae600000001f1ea39e2 + + /* x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1 */ + .octa 0x000000010795297200000001eb6506fc + + /* x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1 */ + .octa 0x0000000162b00abe000000010f806ffe + + /* x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1 */ + .octa 0x000000000d7b404c000000010408481e + + /* x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1 */ + .octa 0x00000000763b13d40000000188260534 + + /* x^109568 mod p(x)` << 1, x^109632 mod p(x)` << 1 */ + .octa 0x00000000f6dc22d80000000058fc73e0 + + /* x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1 */ + .octa 0x000000007daae06000000000391c59b8 + + /* x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1 */ + .octa 0x000000013359ab7c000000018b638400 + + /* x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1 */ + .octa 0x000000008add438a000000011738f5c4 + + /* x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1 */ + .octa 0x00000001edbefdea000000008cf7c6da + + /* x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1 */ + .octa 0x000000004104e0f800000001ef97fb16 + + /* x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1 */ + .octa 0x00000000b48a82220000000102130e20 + + /* x^102400 mod p(x)` << 1, x^102464 mod p(x)` << 1 */ + .octa 0x00000001bcb4684400000000db968898 + + /* x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1 */ + .octa 0x000000013293ce0a00000000b5047b5e + + /* x^100352 mod p(x)` << 1, x^100416 mod p(x)` << 1 */ + .octa 0x00000001710d0844000000010b90fdb2 + + /* x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1 */ + .octa 0x0000000117907f6e000000004834a32e + + /* x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1 */ + .octa 0x0000000087ddf93e0000000059c8f2b0 + + /* x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1 */ + .octa 0x000000005970e9b00000000122cec508 + + /* x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1 */ + .octa 0x0000000185b2b7d0000000000a330cda + + /* x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1 */ + .octa 0x00000001dcee0efc000000014a47148c + + /* x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1 */ + .octa 0x0000000030da27220000000042c61cb8 + + /* x^93184 mod p(x)` << 1, x^93248 mod p(x)` << 1 */ + .octa 0x000000012f925a180000000012fe6960 + + /* x^92160 mod p(x)` << 1, x^92224 mod p(x)` << 1 */ + .octa 0x00000000dd2e357c00000000dbda2c20 + + /* x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1 */ + .octa 0x00000000071c80de000000011122410c + + /* x^90112 mod p(x)` << 1, x^90176 mod p(x)` << 1 */ + .octa 0x000000011513140a00000000977b2070 + + /* x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1 */ + .octa 0x00000001df876e8e000000014050438e + + /* x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1 */ + .octa 0x000000015f81d6ce0000000147c840e8 + + /* x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1 */ + .octa 0x000000019dd94dbe00000001cc7c88ce + + /* x^86016 mod p(x)` << 1, x^86080 mod p(x)` << 1 */ + .octa 0x00000001373d206e00000001476b35a4 + + /* x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1 */ + .octa 0x00000000668ccade000000013d52d508 + + /* x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1 */ + .octa 0x00000001b192d268000000008e4be32e + + /* x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1 */ + .octa 0x00000000e30f3a7800000000024120fe + + /* x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1 */ + .octa 0x000000010ef1f7bc00000000ddecddb4 + + /* x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1 */ + .octa 0x00000001f5ac738000000000d4d403bc + + /* x^79872 mod p(x)` << 1, x^79936 mod p(x)` << 1 */ + .octa 0x000000011822ea7000000001734b89aa + + /* x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1 */ + .octa 0x00000000c3a33848000000010e7a58d6 + + /* x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1 */ + .octa 0x00000001bd151c2400000001f9f04e9c + + /* x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1 */ + .octa 0x0000000056002d7600000000b692225e + + /* x^75776 mod p(x)` << 1, x^75840 mod p(x)` << 1 */ + .octa 0x000000014657c4f4000000019b8d3f3e + + /* x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1 */ + .octa 0x0000000113742d7c00000001a874f11e + + /* x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1 */ + .octa 0x000000019c5920ba000000010d5a4254 + + /* x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1 */ + .octa 0x000000005216d2d600000000bbb2f5d6 + + /* x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1 */ + .octa 0x0000000136f5ad8a0000000179cc0e36 + + /* x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1 */ + .octa 0x000000018b07beb600000001dca1da4a + + /* x^69632 mod p(x)` << 1, x^69696 mod p(x)` << 1 */ + .octa 0x00000000db1e93b000000000feb1a192 + + /* x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1 */ + .octa 0x000000000b96fa3a00000000d1eeedd6 + + /* x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1 */ + .octa 0x00000001d9968af0000000008fad9bb4 + + /* x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1 */ + .octa 0x000000000e4a77a200000001884938e4 + + /* x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1 */ + .octa 0x00000000508c2ac800000001bc2e9bc0 + + /* x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1 */ + .octa 0x0000000021572a8000000001f9658a68 + + /* x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1 */ + .octa 0x00000001b859daf2000000001b9224fc + + /* x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1 */ + .octa 0x000000016f7884740000000055b2fb84 + + /* x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1 */ + .octa 0x00000001b438810e000000018b090348 + + /* x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1 */ + .octa 0x0000000095ddc6f2000000011ccbd5ea + + /* x^59392 mod p(x)` << 1, x^59456 mod p(x)` << 1 */ + .octa 0x00000001d977c20c0000000007ae47f8 + + /* x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1 */ + .octa 0x00000000ebedb99a0000000172acbec0 + + /* x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1 */ + .octa 0x00000001df9e9e9200000001c6e3ff20 + + /* x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 1 */ + .octa 0x00000001a4a3f95200000000e1b38744 + + /* x^55296 mod p(x)` << 1, x^55360 mod p(x)` << 1 */ + .octa 0x00000000e2f5122000000000791585b2 + + /* x^54272 mod p(x)` << 1, x^54336 mod p(x)` << 1 */ + .octa 0x000000004aa01f3e00000000ac53b894 + + /* x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1 */ + .octa 0x00000000b3e90a5800000001ed5f2cf4 + + /* x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1 */ + .octa 0x000000000c9ca2aa00000001df48b2e0 + + /* x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1 */ + .octa 0x000000015168231600000000049c1c62 + + /* x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1 */ + .octa 0x0000000036fce78c000000017c460c12 + + /* x^49152 mod p(x)` << 1, x^49216 mod p(x)` << 1 */ + .octa 0x000000009037dc10000000015be4da7e + + /* x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 1 */ + .octa 0x00000000d3298582000000010f38f668 + + /* x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1 */ + .octa 0x00000001b42e8ad60000000039f40a00 + + /* x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1 */ + .octa 0x00000000142a983800000000bd4c10c4 + + /* x^45056 mod p(x)` << 1, x^45120 mod p(x)` << 1 */ + .octa 0x0000000109c7f1900000000042db1d98 + + /* x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1 */ + .octa 0x0000000056ff931000000001c905bae6 + + /* x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1 */ + .octa 0x00000001594513aa00000000069d40ea + + /* x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1 */ + .octa 0x00000001e3b5b1e8000000008e4fbad0 + + /* x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1 */ + .octa 0x000000011dd5fc080000000047bedd46 + + /* x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1 */ + .octa 0x00000001675f0cc20000000026396bf8 + + /* x^38912 mod p(x)` << 1, x^38976 mod p(x)` << 1 */ + .octa 0x00000000d1c8dd4400000000379beb92 + + /* x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1 */ + .octa 0x0000000115ebd3d8000000000abae54a + + /* x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1 */ + .octa 0x00000001ecbd0dac0000000007e6a128 + + /* x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 1 */ + .octa 0x00000000cdf67af2000000000ade29d2 + + /* x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1 */ + .octa 0x000000004c01ff4c00000000f974c45c + + /* x^33792 mod p(x)` << 1, x^33856 mod p(x)` << 1 */ + .octa 0x00000000f2d8657e00000000e77ac60a + + /* x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1 */ + .octa 0x000000006bae74c40000000145895816 + + /* x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1 */ + .octa 0x0000000152af8aa00000000038e362be + + /* x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1 */ + .octa 0x0000000004663802000000007f991a64 + + /* x^29696 mod p(x)` << 1, x^29760 mod p(x)` << 1 */ + .octa 0x00000001ab2f5afc00000000fa366d3a + + /* x^28672 mod p(x)` << 1, x^28736 mod p(x)` << 1 */ + .octa 0x0000000074a4ebd400000001a2bb34f0 + + /* x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1 */ + .octa 0x00000001d7ab3a4c0000000028a9981e + + /* x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1 */ + .octa 0x00000001a8da60c600000001dbc672be + + /* x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1 */ + .octa 0x000000013cf6382000000000b04d77f6 + + /* x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1 */ + .octa 0x00000000bec12e1e0000000124400d96 + + /* x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1 */ + .octa 0x00000001c6368010000000014ca4b414 + + /* x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1 */ + .octa 0x00000001e6e78758000000012fe2c938 + + /* x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1 */ + .octa 0x000000008d7f2b3c00000001faed01e6 + + /* x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 1 */ + .octa 0x000000016b4a156e000000007e80ecfe + + /* x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1 */ + .octa 0x00000001c63cfeb60000000098daee94 + + /* x^18432 mod p(x)` << 1, x^18496 mod p(x)` << 1 */ + .octa 0x000000015f902670000000010a04edea + + /* x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1 */ + .octa 0x00000001cd5de11e00000001c00b4524 + + /* x^16384 mod p(x)` << 1, x^16448 mod p(x)` << 1 */ + .octa 0x000000001acaec540000000170296550 + + /* x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1 */ + .octa 0x000000002bd0ca780000000181afaa48 + + /* x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1 */ + .octa 0x0000000032d63d5c0000000185a31ffa + + /* x^13312 mod p(x)` << 1, x^13376 mod p(x)` << 1 */ + .octa 0x000000001c6d4e4c000000002469f608 + + /* x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1 */ + .octa 0x0000000106a60b92000000006980102a + + /* x^11264 mod p(x)` << 1, x^11328 mod p(x)` << 1 */ + .octa 0x00000000d3855e120000000111ea9ca8 + + /* x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1 */ + .octa 0x00000000e312563600000001bd1d29ce + + /* x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1 */ + .octa 0x000000009e8f7ea400000001b34b9580 + + /* x^8192 mod p(x)` << 1, x^8256 mod p(x)` << 1 */ + .octa 0x00000001c82e562c000000003076054e + + /* x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1 */ + .octa 0x00000000ca9f09ce000000012a608ea4 + + /* x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1 */ + .octa 0x00000000c63764e600000000784d05fe + + /* x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1 */ + .octa 0x0000000168d2e49e000000016ef0d82a + + /* x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1 */ + .octa 0x00000000e986c1480000000075bda454 + + /* x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1 */ + .octa 0x00000000cfb65894000000003dc0a1c4 + + /* x^2048 mod p(x)` << 1, x^2112 mod p(x)` << 1 */ + .octa 0x0000000111cadee400000000e9a5d8be + + /* x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1 */ + .octa 0x0000000171fb63ce00000001609bc4b4 + +.short_constants: + + /* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */ + /* x^1952 mod p(x)`, x^1984 mod p(x)`, x^2016 mod p(x)`, x^2048 mod p(x)` */ + .octa 0x7fec2963e5bf80485cf015c388e56f72 + + /* x^1824 mod p(x)`, x^1856 mod p(x)`, x^1888 mod p(x)`, x^1920 mod p(x)` */ + .octa 0x38e888d4844752a9963a18920246e2e6 + + /* x^1696 mod p(x)`, x^1728 mod p(x)`, x^1760 mod p(x)`, x^1792 mod p(x)` */ + .octa 0x42316c00730206ad419a441956993a31 + + /* x^1568 mod p(x)`, x^1600 mod p(x)`, x^1632 mod p(x)`, x^1664 mod p(x)` */ + .octa 0x543d5c543e65ddf9924752ba2b830011 + + /* x^1440 mod p(x)`, x^1472 mod p(x)`, x^1504 mod p(x)`, x^1536 mod p(x)` */ + .octa 0x78e87aaf56767c9255bd7f9518e4a304 + + /* x^1312 mod p(x)`, x^1344 mod p(x)`, x^1376 mod p(x)`, x^1408 mod p(x)` */ + .octa 0x8f68fcec1903da7f6d76739fe0553f1e + + /* x^1184 mod p(x)`, x^1216 mod p(x)`, x^1248 mod p(x)`, x^1280 mod p(x)` */ + .octa 0x3f4840246791d588c133722b1fe0b5c3 + + /* x^1056 mod p(x)`, x^1088 mod p(x)`, x^1120 mod p(x)`, x^1152 mod p(x)` */ + .octa 0x34c96751b04de25a64b67ee0e55ef1f3 + + /* x^928 mod p(x)`, x^960 mod p(x)`, x^992 mod p(x)`, x^1024 mod p(x)` */ + .octa 0x156c8e180b4a395b069db049b8fdb1e7 + + /* x^800 mod p(x)`, x^832 mod p(x)`, x^864 mod p(x)`, x^896 mod p(x)` */ + .octa 0xe0b99ccbe661f7bea11bfaf3c9e90b9e + + /* x^672 mod p(x)`, x^704 mod p(x)`, x^736 mod p(x)`, x^768 mod p(x)` */ + .octa 0x041d37768cd75659817cdc5119b29a35 + + /* x^544 mod p(x)`, x^576 mod p(x)`, x^608 mod p(x)`, x^640 mod p(x)` */ + .octa 0x3a0777818cfaa9651ce9d94b36c41f1c + + /* x^416 mod p(x)`, x^448 mod p(x)`, x^480 mod p(x)`, x^512 mod p(x)` */ + .octa 0x0e148e8252377a554f256efcb82be955 + + /* x^288 mod p(x)`, x^320 mod p(x)`, x^352 mod p(x)`, x^384 mod p(x)` */ + .octa 0x9c25531d19e65ddeec1631edb2dea967 + + /* x^160 mod p(x)`, x^192 mod p(x)`, x^224 mod p(x)`, x^256 mod p(x)` */ + .octa 0x790606ff9957c0a65d27e147510ac59a + + /* x^32 mod p(x)`, x^64 mod p(x)`, x^96 mod p(x)`, x^128 mod p(x)` */ + .octa 0x82f63b786ea2d55ca66805eb18b8ea18 + + +.barrett_constants: + /* 33 bit reflected Barrett constant m - (4^32)/n */ + .octa 0x000000000000000000000000dea713f1 /* x^64 div p(x)` */ + /* 33 bit reflected Barrett constant n */ + .octa 0x00000000000000000000000105ec76f1 + + .text + +#if defined(__BIG_ENDIAN__) +#define BYTESWAP_DATA +#else +#undef BYTESWAP_DATA +#endif + +#define off16 r25 +#define off32 r26 +#define off48 r27 +#define off64 r28 +#define off80 r29 +#define off96 r30 +#define off112 r31 + +#define const1 v24 +#define const2 v25 + +#define byteswap v26 +#define mask_32bit v27 +#define mask_64bit v28 +#define zeroes v29 + +#ifdef BYTESWAP_DATA +#define VPERM(A, B, C, D) vperm A, B, C, D +#else +#define VPERM(A, B, C, D) +#endif + +/* unsigned int __crc32c_vpmsum(unsigned int crc, void *p, unsigned long len) */ +FUNC_START(__crc32c_vpmsum) + std r31,-8(r1) + std r30,-16(r1) + std r29,-24(r1) + std r28,-32(r1) + std r27,-40(r1) + std r26,-48(r1) + std r25,-56(r1) + + li off16,16 + li off32,32 + li off48,48 + li off64,64 + li off80,80 + li off96,96 + li off112,112 + li r0,0 + + /* Enough room for saving 10 non volatile VMX registers */ + subi r6,r1,56+10*16 + subi r7,r1,56+2*16 + + stvx v20,0,r6 + stvx v21,off16,r6 + stvx v22,off32,r6 + stvx v23,off48,r6 + stvx v24,off64,r6 + stvx v25,off80,r6 + stvx v26,off96,r6 + stvx v27,off112,r6 + stvx v28,0,r7 + stvx v29,off16,r7 + + mr r10,r3 + + vxor zeroes,zeroes,zeroes + vspltisw v0,-1 + + vsldoi mask_32bit,zeroes,v0,4 + vsldoi mask_64bit,zeroes,v0,8 + + /* Get the initial value into v8 */ + vxor v8,v8,v8 + MTVRD(v8, R3) + vsldoi v8,zeroes,v8,8 /* shift into bottom 32 bits */ + +#ifdef BYTESWAP_DATA + addis r3,r2,.byteswap_constant@toc@ha + addi r3,r3,.byteswap_constant@toc@l + + lvx byteswap,0,r3 + addi r3,r3,16 +#endif + + cmpdi r5,256 + blt .Lshort + + rldicr r6,r5,0,56 + + /* Checksum in blocks of MAX_SIZE */ +1: lis r7,MAX_SIZE@h + ori r7,r7,MAX_SIZE@l + mr r9,r7 + cmpd r6,r7 + bgt 2f + mr r7,r6 +2: subf r6,r7,r6 + + /* our main loop does 128 bytes at a time */ + srdi r7,r7,7 + + /* + * Work out the offset into the constants table to start at. Each + * constant is 16 bytes, and it is used against 128 bytes of input + * data - 128 / 16 = 8 + */ + sldi r8,r7,4 + srdi r9,r9,3 + subf r8,r8,r9 + + /* We reduce our final 128 bytes in a separate step */ + addi r7,r7,-1 + mtctr r7 + + addis r3,r2,.constants@toc@ha + addi r3,r3,.constants@toc@l + + /* Find the start of our constants */ + add r3,r3,r8 + + /* zero v0-v7 which will contain our checksums */ + vxor v0,v0,v0 + vxor v1,v1,v1 + vxor v2,v2,v2 + vxor v3,v3,v3 + vxor v4,v4,v4 + vxor v5,v5,v5 + vxor v6,v6,v6 + vxor v7,v7,v7 + + lvx const1,0,r3 + + /* + * If we are looping back to consume more data we use the values + * already in v16-v23. + */ + cmpdi r0,1 + beq 2f + + /* First warm up pass */ + lvx v16,0,r4 + lvx v17,off16,r4 + VPERM(v16,v16,v16,byteswap) + VPERM(v17,v17,v17,byteswap) + lvx v18,off32,r4 + lvx v19,off48,r4 + VPERM(v18,v18,v18,byteswap) + VPERM(v19,v19,v19,byteswap) + lvx v20,off64,r4 + lvx v21,off80,r4 + VPERM(v20,v20,v20,byteswap) + VPERM(v21,v21,v21,byteswap) + lvx v22,off96,r4 + lvx v23,off112,r4 + VPERM(v22,v22,v22,byteswap) + VPERM(v23,v23,v23,byteswap) + addi r4,r4,8*16 + + /* xor in initial value */ + vxor v16,v16,v8 + +2: bdz .Lfirst_warm_up_done + + addi r3,r3,16 + lvx const2,0,r3 + + /* Second warm up pass */ + VPMSUMD(v8,v16,const1) + lvx v16,0,r4 + VPERM(v16,v16,v16,byteswap) + ori r2,r2,0 + + VPMSUMD(v9,v17,const1) + lvx v17,off16,r4 + VPERM(v17,v17,v17,byteswap) + ori r2,r2,0 + + VPMSUMD(v10,v18,const1) + lvx v18,off32,r4 + VPERM(v18,v18,v18,byteswap) + ori r2,r2,0 + + VPMSUMD(v11,v19,const1) + lvx v19,off48,r4 + VPERM(v19,v19,v19,byteswap) + ori r2,r2,0 + + VPMSUMD(v12,v20,const1) + lvx v20,off64,r4 + VPERM(v20,v20,v20,byteswap) + ori r2,r2,0 + + VPMSUMD(v13,v21,const1) + lvx v21,off80,r4 + VPERM(v21,v21,v21,byteswap) + ori r2,r2,0 + + VPMSUMD(v14,v22,const1) + lvx v22,off96,r4 + VPERM(v22,v22,v22,byteswap) + ori r2,r2,0 + + VPMSUMD(v15,v23,const1) + lvx v23,off112,r4 + VPERM(v23,v23,v23,byteswap) + + addi r4,r4,8*16 + + bdz .Lfirst_cool_down + + /* + * main loop. We modulo schedule it such that it takes three iterations + * to complete - first iteration load, second iteration vpmsum, third + * iteration xor. + */ + .balign 16 +4: lvx const1,0,r3 + addi r3,r3,16 + ori r2,r2,0 + + vxor v0,v0,v8 + VPMSUMD(v8,v16,const2) + lvx v16,0,r4 + VPERM(v16,v16,v16,byteswap) + ori r2,r2,0 + + vxor v1,v1,v9 + VPMSUMD(v9,v17,const2) + lvx v17,off16,r4 + VPERM(v17,v17,v17,byteswap) + ori r2,r2,0 + + vxor v2,v2,v10 + VPMSUMD(v10,v18,const2) + lvx v18,off32,r4 + VPERM(v18,v18,v18,byteswap) + ori r2,r2,0 + + vxor v3,v3,v11 + VPMSUMD(v11,v19,const2) + lvx v19,off48,r4 + VPERM(v19,v19,v19,byteswap) + lvx const2,0,r3 + ori r2,r2,0 + + vxor v4,v4,v12 + VPMSUMD(v12,v20,const1) + lvx v20,off64,r4 + VPERM(v20,v20,v20,byteswap) + ori r2,r2,0 + + vxor v5,v5,v13 + VPMSUMD(v13,v21,const1) + lvx v21,off80,r4 + VPERM(v21,v21,v21,byteswap) + ori r2,r2,0 + + vxor v6,v6,v14 + VPMSUMD(v14,v22,const1) + lvx v22,off96,r4 + VPERM(v22,v22,v22,byteswap) + ori r2,r2,0 + + vxor v7,v7,v15 + VPMSUMD(v15,v23,const1) + lvx v23,off112,r4 + VPERM(v23,v23,v23,byteswap) + + addi r4,r4,8*16 + + bdnz 4b + +.Lfirst_cool_down: + /* First cool down pass */ + lvx const1,0,r3 + addi r3,r3,16 + + vxor v0,v0,v8 + VPMSUMD(v8,v16,const1) + ori r2,r2,0 + + vxor v1,v1,v9 + VPMSUMD(v9,v17,const1) + ori r2,r2,0 + + vxor v2,v2,v10 + VPMSUMD(v10,v18,const1) + ori r2,r2,0 + + vxor v3,v3,v11 + VPMSUMD(v11,v19,const1) + ori r2,r2,0 + + vxor v4,v4,v12 + VPMSUMD(v12,v20,const1) + ori r2,r2,0 + + vxor v5,v5,v13 + VPMSUMD(v13,v21,const1) + ori r2,r2,0 + + vxor v6,v6,v14 + VPMSUMD(v14,v22,const1) + ori r2,r2,0 + + vxor v7,v7,v15 + VPMSUMD(v15,v23,const1) + ori r2,r2,0 + +.Lsecond_cool_down: + /* Second cool down pass */ + vxor v0,v0,v8 + vxor v1,v1,v9 + vxor v2,v2,v10 + vxor v3,v3,v11 + vxor v4,v4,v12 + vxor v5,v5,v13 + vxor v6,v6,v14 + vxor v7,v7,v15 + + /* + * vpmsumd produces a 96 bit result in the least significant bits + * of the register. Since we are bit reflected we have to shift it + * left 32 bits so it occupies the least significant bits in the + * bit reflected domain. + */ + vsldoi v0,v0,zeroes,4 + vsldoi v1,v1,zeroes,4 + vsldoi v2,v2,zeroes,4 + vsldoi v3,v3,zeroes,4 + vsldoi v4,v4,zeroes,4 + vsldoi v5,v5,zeroes,4 + vsldoi v6,v6,zeroes,4 + vsldoi v7,v7,zeroes,4 + + /* xor with last 1024 bits */ + lvx v8,0,r4 + lvx v9,off16,r4 + VPERM(v8,v8,v8,byteswap) + VPERM(v9,v9,v9,byteswap) + lvx v10,off32,r4 + lvx v11,off48,r4 + VPERM(v10,v10,v10,byteswap) + VPERM(v11,v11,v11,byteswap) + lvx v12,off64,r4 + lvx v13,off80,r4 + VPERM(v12,v12,v12,byteswap) + VPERM(v13,v13,v13,byteswap) + lvx v14,off96,r4 + lvx v15,off112,r4 + VPERM(v14,v14,v14,byteswap) + VPERM(v15,v15,v15,byteswap) + + addi r4,r4,8*16 + + vxor v16,v0,v8 + vxor v17,v1,v9 + vxor v18,v2,v10 + vxor v19,v3,v11 + vxor v20,v4,v12 + vxor v21,v5,v13 + vxor v22,v6,v14 + vxor v23,v7,v15 + + li r0,1 + cmpdi r6,0 + addi r6,r6,128 + bne 1b + + /* Work out how many bytes we have left */ + andi. r5,r5,127 + + /* Calculate where in the constant table we need to start */ + subfic r6,r5,128 + add r3,r3,r6 + + /* How many 16 byte chunks are in the tail */ + srdi r7,r5,4 + mtctr r7 + + /* + * Reduce the previously calculated 1024 bits to 64 bits, shifting + * 32 bits to include the trailing 32 bits of zeros + */ + lvx v0,0,r3 + lvx v1,off16,r3 + lvx v2,off32,r3 + lvx v3,off48,r3 + lvx v4,off64,r3 + lvx v5,off80,r3 + lvx v6,off96,r3 + lvx v7,off112,r3 + addi r3,r3,8*16 + + VPMSUMW(v0,v16,v0) + VPMSUMW(v1,v17,v1) + VPMSUMW(v2,v18,v2) + VPMSUMW(v3,v19,v3) + VPMSUMW(v4,v20,v4) + VPMSUMW(v5,v21,v5) + VPMSUMW(v6,v22,v6) + VPMSUMW(v7,v23,v7) + + /* Now reduce the tail (0 - 112 bytes) */ + cmpdi r7,0 + beq 1f + + lvx v16,0,r4 + lvx v17,0,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off16,r4 + lvx v17,off16,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off32,r4 + lvx v17,off32,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off48,r4 + lvx v17,off48,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off64,r4 + lvx v17,off64,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off80,r4 + lvx v17,off80,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off96,r4 + lvx v17,off96,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + + /* Now xor all the parallel chunks together */ +1: vxor v0,v0,v1 + vxor v2,v2,v3 + vxor v4,v4,v5 + vxor v6,v6,v7 + + vxor v0,v0,v2 + vxor v4,v4,v6 + + vxor v0,v0,v4 + +.Lbarrett_reduction: + /* Barrett constants */ + addis r3,r2,.barrett_constants@toc@ha + addi r3,r3,.barrett_constants@toc@l + + lvx const1,0,r3 + lvx const2,off16,r3 + + vsldoi v1,v0,v0,8 + vxor v0,v0,v1 /* xor two 64 bit results together */ + + /* shift left one bit */ + vspltisb v1,1 + vsl v0,v0,v1 + + vand v0,v0,mask_64bit + + /* + * The reflected version of Barrett reduction. Instead of bit + * reflecting our data (which is expensive to do), we bit reflect our + * constants and our algorithm, which means the intermediate data in + * our vector registers goes from 0-63 instead of 63-0. We can reflect + * the algorithm because we don't carry in mod 2 arithmetic. + */ + vand v1,v0,mask_32bit /* bottom 32 bits of a */ + VPMSUMD(v1,v1,const1) /* ma */ + vand v1,v1,mask_32bit /* bottom 32bits of ma */ + VPMSUMD(v1,v1,const2) /* qn */ + vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */ + + /* + * Since we are bit reflected, the result (ie the low 32 bits) is in + * the high 32 bits. We just need to shift it left 4 bytes + * V0 [ 0 1 X 3 ] + * V0 [ 0 X 2 3 ] + */ + vsldoi v0,v0,zeroes,4 /* shift result into top 64 bits of */ + + /* Get it into r3 */ + MFVRD(R3, v0) + +.Lout: + subi r6,r1,56+10*16 + subi r7,r1,56+2*16 + + lvx v20,0,r6 + lvx v21,off16,r6 + lvx v22,off32,r6 + lvx v23,off48,r6 + lvx v24,off64,r6 + lvx v25,off80,r6 + lvx v26,off96,r6 + lvx v27,off112,r6 + lvx v28,0,r7 + lvx v29,off16,r7 + + ld r31,-8(r1) + ld r30,-16(r1) + ld r29,-24(r1) + ld r28,-32(r1) + ld r27,-40(r1) + ld r26,-48(r1) + ld r25,-56(r1) + + blr + +.Lfirst_warm_up_done: + lvx const1,0,r3 + addi r3,r3,16 + + VPMSUMD(v8,v16,const1) + VPMSUMD(v9,v17,const1) + VPMSUMD(v10,v18,const1) + VPMSUMD(v11,v19,const1) + VPMSUMD(v12,v20,const1) + VPMSUMD(v13,v21,const1) + VPMSUMD(v14,v22,const1) + VPMSUMD(v15,v23,const1) + + b .Lsecond_cool_down + +.Lshort: + cmpdi r5,0 + beq .Lzero + + addis r3,r2,.short_constants@toc@ha + addi r3,r3,.short_constants@toc@l + + /* Calculate where in the constant table we need to start */ + subfic r6,r5,256 + add r3,r3,r6 + + /* How many 16 byte chunks? */ + srdi r7,r5,4 + mtctr r7 + + vxor v19,v19,v19 + vxor v20,v20,v20 + + lvx v0,0,r4 + lvx v16,0,r3 + VPERM(v0,v0,v16,byteswap) + vxor v0,v0,v8 /* xor in initial value */ + VPMSUMW(v0,v0,v16) + bdz .Lv0 + + lvx v1,off16,r4 + lvx v17,off16,r3 + VPERM(v1,v1,v17,byteswap) + VPMSUMW(v1,v1,v17) + bdz .Lv1 + + lvx v2,off32,r4 + lvx v16,off32,r3 + VPERM(v2,v2,v16,byteswap) + VPMSUMW(v2,v2,v16) + bdz .Lv2 + + lvx v3,off48,r4 + lvx v17,off48,r3 + VPERM(v3,v3,v17,byteswap) + VPMSUMW(v3,v3,v17) + bdz .Lv3 + + lvx v4,off64,r4 + lvx v16,off64,r3 + VPERM(v4,v4,v16,byteswap) + VPMSUMW(v4,v4,v16) + bdz .Lv4 + + lvx v5,off80,r4 + lvx v17,off80,r3 + VPERM(v5,v5,v17,byteswap) + VPMSUMW(v5,v5,v17) + bdz .Lv5 + + lvx v6,off96,r4 + lvx v16,off96,r3 + VPERM(v6,v6,v16,byteswap) + VPMSUMW(v6,v6,v16) + bdz .Lv6 + + lvx v7,off112,r4 + lvx v17,off112,r3 + VPERM(v7,v7,v17,byteswap) + VPMSUMW(v7,v7,v17) + bdz .Lv7 + + addi r3,r3,128 + addi r4,r4,128 + + lvx v8,0,r4 + lvx v16,0,r3 + VPERM(v8,v8,v16,byteswap) + VPMSUMW(v8,v8,v16) + bdz .Lv8 + + lvx v9,off16,r4 + lvx v17,off16,r3 + VPERM(v9,v9,v17,byteswap) + VPMSUMW(v9,v9,v17) + bdz .Lv9 + + lvx v10,off32,r4 + lvx v16,off32,r3 + VPERM(v10,v10,v16,byteswap) + VPMSUMW(v10,v10,v16) + bdz .Lv10 + + lvx v11,off48,r4 + lvx v17,off48,r3 + VPERM(v11,v11,v17,byteswap) + VPMSUMW(v11,v11,v17) + bdz .Lv11 + + lvx v12,off64,r4 + lvx v16,off64,r3 + VPERM(v12,v12,v16,byteswap) + VPMSUMW(v12,v12,v16) + bdz .Lv12 + + lvx v13,off80,r4 + lvx v17,off80,r3 + VPERM(v13,v13,v17,byteswap) + VPMSUMW(v13,v13,v17) + bdz .Lv13 + + lvx v14,off96,r4 + lvx v16,off96,r3 + VPERM(v14,v14,v16,byteswap) + VPMSUMW(v14,v14,v16) + bdz .Lv14 + + lvx v15,off112,r4 + lvx v17,off112,r3 + VPERM(v15,v15,v17,byteswap) + VPMSUMW(v15,v15,v17) + +.Lv15: vxor v19,v19,v15 +.Lv14: vxor v20,v20,v14 +.Lv13: vxor v19,v19,v13 +.Lv12: vxor v20,v20,v12 +.Lv11: vxor v19,v19,v11 +.Lv10: vxor v20,v20,v10 +.Lv9: vxor v19,v19,v9 +.Lv8: vxor v20,v20,v8 +.Lv7: vxor v19,v19,v7 +.Lv6: vxor v20,v20,v6 +.Lv5: vxor v19,v19,v5 +.Lv4: vxor v20,v20,v4 +.Lv3: vxor v19,v19,v3 +.Lv2: vxor v20,v20,v2 +.Lv1: vxor v19,v19,v1 +.Lv0: vxor v20,v20,v0 + + vxor v0,v19,v20 + + b .Lbarrett_reduction + +.Lzero: + mr r3,r10 + b .Lout + +FUNC_END(__crc32_vpmsum) diff --git a/arch/powerpc/crypto/crc32c-vpmsum_glue.c b/arch/powerpc/crypto/crc32c-vpmsum_glue.c new file mode 100644 index 000000000000..bfe3d37a24ef --- /dev/null +++ b/arch/powerpc/crypto/crc32c-vpmsum_glue.c @@ -0,0 +1,167 @@ +#include <linux/crc32.h> +#include <crypto/internal/hash.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <asm/switch_to.h> + +#define CHKSUM_BLOCK_SIZE 1 +#define CHKSUM_DIGEST_SIZE 4 + +#define VMX_ALIGN 16 +#define VMX_ALIGN_MASK (VMX_ALIGN-1) + +#define VECTOR_BREAKPOINT 512 + +u32 __crc32c_vpmsum(u32 crc, unsigned char const *p, size_t len); + +static u32 crc32c_vpmsum(u32 crc, unsigned char const *p, size_t len) +{ + unsigned int prealign; + unsigned int tail; + + if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) || in_interrupt()) + return __crc32c_le(crc, p, len); + + if ((unsigned long)p & VMX_ALIGN_MASK) { + prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK); + crc = __crc32c_le(crc, p, prealign); + len -= prealign; + p += prealign; + } + + if (len & ~VMX_ALIGN_MASK) { + pagefault_disable(); + enable_kernel_altivec(); + crc = __crc32c_vpmsum(crc, p, len & ~VMX_ALIGN_MASK); + pagefault_enable(); + } + + tail = len & VMX_ALIGN_MASK; + if (tail) { + p += len & ~VMX_ALIGN_MASK; + crc = __crc32c_le(crc, p, tail); + } + + return crc; +} + +static int crc32c_vpmsum_cra_init(struct crypto_tfm *tfm) +{ + u32 *key = crypto_tfm_ctx(tfm); + + *key = 0; + + return 0; +} + +/* + * Setting the seed allows arbitrary accumulators and flexible XOR policy + * If your algorithm starts with ~0, then XOR with ~0 before you set + * the seed. + */ +static int crc32c_vpmsum_setkey(struct crypto_shash *hash, const u8 *key, + unsigned int keylen) +{ + u32 *mctx = crypto_shash_ctx(hash); + + if (keylen != sizeof(u32)) { + crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + *mctx = le32_to_cpup((__le32 *)key); + return 0; +} + +static int crc32c_vpmsum_init(struct shash_desc *desc) +{ + u32 *mctx = crypto_shash_ctx(desc->tfm); + u32 *crcp = shash_desc_ctx(desc); + + *crcp = *mctx; + + return 0; +} + +static int crc32c_vpmsum_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + u32 *crcp = shash_desc_ctx(desc); + + *crcp = crc32c_vpmsum(*crcp, data, len); + + return 0; +} + +static int __crc32c_vpmsum_finup(u32 *crcp, const u8 *data, unsigned int len, + u8 *out) +{ + *(__le32 *)out = ~cpu_to_le32(crc32c_vpmsum(*crcp, data, len)); + + return 0; +} + +static int crc32c_vpmsum_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32c_vpmsum_finup(shash_desc_ctx(desc), data, len, out); +} + +static int crc32c_vpmsum_final(struct shash_desc *desc, u8 *out) +{ + u32 *crcp = shash_desc_ctx(desc); + + *(__le32 *)out = ~cpu_to_le32p(crcp); + + return 0; +} + +static int crc32c_vpmsum_digest(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32c_vpmsum_finup(crypto_shash_ctx(desc->tfm), data, len, + out); +} + +static struct shash_alg alg = { + .setkey = crc32c_vpmsum_setkey, + .init = crc32c_vpmsum_init, + .update = crc32c_vpmsum_update, + .final = crc32c_vpmsum_final, + .finup = crc32c_vpmsum_finup, + .digest = crc32c_vpmsum_digest, + .descsize = sizeof(u32), + .digestsize = CHKSUM_DIGEST_SIZE, + .base = { + .cra_name = "crc32c", + .cra_driver_name = "crc32c-vpmsum", + .cra_priority = 200, + .cra_blocksize = CHKSUM_BLOCK_SIZE, + .cra_ctxsize = sizeof(u32), + .cra_module = THIS_MODULE, + .cra_init = crc32c_vpmsum_cra_init, + } +}; + +static int __init crc32c_vpmsum_mod_init(void) +{ + if (!cpu_has_feature(CPU_FTR_ARCH_207S)) + return -ENODEV; + + return crypto_register_shash(&alg); +} + +static void __exit crc32c_vpmsum_mod_fini(void) +{ + crypto_unregister_shash(&alg); +} + +module_init(crc32c_vpmsum_mod_init); +module_exit(crc32c_vpmsum_mod_fini); + +MODULE_AUTHOR("Anton Blanchard <anton@samba.org>"); +MODULE_DESCRIPTION("CRC32C using vector polynomial multiply-sum instructions"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_CRYPTO("crc32c"); +MODULE_ALIAS_CRYPTO("crc32c-vpmsum"); diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 1d035c1cc889..49cd8760aa7c 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -174,6 +174,8 @@ #define PPC_INST_MFSPR_DSCR_USER_MASK 0xfc1fffff #define PPC_INST_MTSPR_DSCR_USER 0x7c0303a6 #define PPC_INST_MTSPR_DSCR_USER_MASK 0xfc1fffff +#define PPC_INST_MFVSRD 0x7c000066 +#define PPC_INST_MTVSRD 0x7c000166 #define PPC_INST_SLBFEE 0x7c0007a7 #define PPC_INST_STRING 0x7c00042a @@ -188,6 +190,8 @@ #define PPC_INST_WAIT 0x7c00007c #define PPC_INST_TLBIVAX 0x7c000624 #define PPC_INST_TLBSRX_DOT 0x7c0006a5 +#define PPC_INST_VPMSUMW 0x10000488 +#define PPC_INST_VPMSUMD 0x100004c8 #define PPC_INST_XXLOR 0xf0000510 #define PPC_INST_XXSWAPD 0xf0000250 #define PPC_INST_XVCPSGNDP 0xf0000780 @@ -359,6 +363,14 @@ VSX_XX1((s), a, b)) #define LXVD2X(s, a, b) stringify_in_c(.long PPC_INST_LXVD2X | \ VSX_XX1((s), a, b)) +#define MFVRD(a, t) stringify_in_c(.long PPC_INST_MFVSRD | \ + VSX_XX1((t)+32, a, R0)) +#define MTVRD(t, a) stringify_in_c(.long PPC_INST_MTVSRD | \ + VSX_XX1((t)+32, a, R0)) +#define VPMSUMW(t, a, b) stringify_in_c(.long PPC_INST_VPMSUMW | \ + VSX_XX3((t), a, b)) +#define VPMSUMD(t, a, b) stringify_in_c(.long PPC_INST_VPMSUMD | \ + VSX_XX3((t), a, b)) #define XXLOR(t, a, b) stringify_in_c(.long PPC_INST_XXLOR | \ VSX_XX3((t), a, b)) #define XXSWAPD(t, a) stringify_in_c(.long PPC_INST_XXSWAPD | \ diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index 2b31632376a5..051af612a7e1 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -286,6 +286,9 @@ n: #endif +#define FUNC_START(name) _GLOBAL(name) +#define FUNC_END(name) + /* * LOAD_REG_IMMEDIATE(rn, expr) * Loads the value of the constant expression 'expr' into register 'rn' diff --git a/arch/powerpc/kernel/iomap.c b/arch/powerpc/kernel/iomap.c index 12e48d56f771..3963f0b68d52 100644 --- a/arch/powerpc/kernel/iomap.c +++ b/arch/powerpc/kernel/iomap.c @@ -38,6 +38,18 @@ EXPORT_SYMBOL(ioread16); EXPORT_SYMBOL(ioread16be); EXPORT_SYMBOL(ioread32); EXPORT_SYMBOL(ioread32be); +#ifdef __powerpc64__ +u64 ioread64(void __iomem *addr) +{ + return readq(addr); +} +u64 ioread64be(void __iomem *addr) +{ + return readq_be(addr); +} +EXPORT_SYMBOL(ioread64); +EXPORT_SYMBOL(ioread64be); +#endif /* __powerpc64__ */ void iowrite8(u8 val, void __iomem *addr) { @@ -64,6 +76,18 @@ EXPORT_SYMBOL(iowrite16); EXPORT_SYMBOL(iowrite16be); EXPORT_SYMBOL(iowrite32); EXPORT_SYMBOL(iowrite32be); +#ifdef __powerpc64__ +void iowrite64(u64 val, void __iomem *addr) +{ + writeq(val, addr); +} +void iowrite64be(u64 val, void __iomem *addr) +{ + writeq_be(val, addr); +} +EXPORT_SYMBOL(iowrite64); +EXPORT_SYMBOL(iowrite64be); +#endif /* __powerpc64__ */ /* * These are the "repeat read/write" functions. Note the diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c index 7554a8bb2adc..2ea18b050309 100644 --- a/arch/s390/crypto/aes_s390.c +++ b/arch/s390/crypto/aes_s390.c @@ -22,6 +22,7 @@ #include <crypto/aes.h> #include <crypto/algapi.h> +#include <crypto/internal/skcipher.h> #include <linux/err.h> #include <linux/module.h> #include <linux/cpufeature.h> @@ -44,7 +45,7 @@ struct s390_aes_ctx { long dec; int key_len; union { - struct crypto_blkcipher *blk; + struct crypto_skcipher *blk; struct crypto_cipher *cip; } fallback; }; @@ -63,7 +64,7 @@ struct s390_xts_ctx { long enc; long dec; int key_len; - struct crypto_blkcipher *fallback; + struct crypto_skcipher *fallback; }; /* @@ -237,16 +238,16 @@ static int setkey_fallback_blk(struct crypto_tfm *tfm, const u8 *key, struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); unsigned int ret; - sctx->fallback.blk->base.crt_flags &= ~CRYPTO_TFM_REQ_MASK; - sctx->fallback.blk->base.crt_flags |= (tfm->crt_flags & - CRYPTO_TFM_REQ_MASK); + crypto_skcipher_clear_flags(sctx->fallback.blk, CRYPTO_TFM_REQ_MASK); + crypto_skcipher_set_flags(sctx->fallback.blk, tfm->crt_flags & + CRYPTO_TFM_REQ_MASK); + + ret = crypto_skcipher_setkey(sctx->fallback.blk, key, len); + + tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK; + tfm->crt_flags |= crypto_skcipher_get_flags(sctx->fallback.blk) & + CRYPTO_TFM_RES_MASK; - ret = crypto_blkcipher_setkey(sctx->fallback.blk, key, len); - if (ret) { - tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK; - tfm->crt_flags |= (sctx->fallback.blk->base.crt_flags & - CRYPTO_TFM_RES_MASK); - } return ret; } @@ -255,15 +256,17 @@ static int fallback_blk_dec(struct blkcipher_desc *desc, unsigned int nbytes) { unsigned int ret; - struct crypto_blkcipher *tfm; - struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); + struct crypto_blkcipher *tfm = desc->tfm; + struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(tfm); + SKCIPHER_REQUEST_ON_STACK(req, sctx->fallback.blk); - tfm = desc->tfm; - desc->tfm = sctx->fallback.blk; + skcipher_request_set_tfm(req, sctx->fallback.blk); + skcipher_request_set_callback(req, desc->flags, NULL, NULL); + skcipher_request_set_crypt(req, src, dst, nbytes, desc->info); - ret = crypto_blkcipher_decrypt_iv(desc, dst, src, nbytes); + ret = crypto_skcipher_decrypt(req); - desc->tfm = tfm; + skcipher_request_zero(req); return ret; } @@ -272,15 +275,15 @@ static int fallback_blk_enc(struct blkcipher_desc *desc, unsigned int nbytes) { unsigned int ret; - struct crypto_blkcipher *tfm; - struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); + struct crypto_blkcipher *tfm = desc->tfm; + struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(tfm); + SKCIPHER_REQUEST_ON_STACK(req, sctx->fallback.blk); - tfm = desc->tfm; - desc->tfm = sctx->fallback.blk; + skcipher_request_set_tfm(req, sctx->fallback.blk); + skcipher_request_set_callback(req, desc->flags, NULL, NULL); + skcipher_request_set_crypt(req, src, dst, nbytes, desc->info); - ret = crypto_blkcipher_encrypt_iv(desc, dst, src, nbytes); - - desc->tfm = tfm; + ret = crypto_skcipher_encrypt(req); return ret; } @@ -370,8 +373,9 @@ static int fallback_init_blk(struct crypto_tfm *tfm) const char *name = tfm->__crt_alg->cra_name; struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); - sctx->fallback.blk = crypto_alloc_blkcipher(name, 0, - CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK); + sctx->fallback.blk = crypto_alloc_skcipher(name, 0, + CRYPTO_ALG_ASYNC | + CRYPTO_ALG_NEED_FALLBACK); if (IS_ERR(sctx->fallback.blk)) { pr_err("Allocating AES fallback algorithm %s failed\n", @@ -386,8 +390,7 @@ static void fallback_exit_blk(struct crypto_tfm *tfm) { struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); - crypto_free_blkcipher(sctx->fallback.blk); - sctx->fallback.blk = NULL; + crypto_free_skcipher(sctx->fallback.blk); } static struct crypto_alg ecb_aes_alg = { @@ -536,16 +539,16 @@ static int xts_fallback_setkey(struct crypto_tfm *tfm, const u8 *key, struct s390_xts_ctx *xts_ctx = crypto_tfm_ctx(tfm); unsigned int ret; - xts_ctx->fallback->base.crt_flags &= ~CRYPTO_TFM_REQ_MASK; - xts_ctx->fallback->base.crt_flags |= (tfm->crt_flags & - CRYPTO_TFM_REQ_MASK); + crypto_skcipher_clear_flags(xts_ctx->fallback, CRYPTO_TFM_REQ_MASK); + crypto_skcipher_set_flags(xts_ctx->fallback, tfm->crt_flags & + CRYPTO_TFM_REQ_MASK); + + ret = crypto_skcipher_setkey(xts_ctx->fallback, key, len); + + tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK; + tfm->crt_flags |= crypto_skcipher_get_flags(xts_ctx->fallback) & + CRYPTO_TFM_RES_MASK; - ret = crypto_blkcipher_setkey(xts_ctx->fallback, key, len); - if (ret) { - tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK; - tfm->crt_flags |= (xts_ctx->fallback->base.crt_flags & - CRYPTO_TFM_RES_MASK); - } return ret; } @@ -553,16 +556,18 @@ static int xts_fallback_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { - struct s390_xts_ctx *xts_ctx = crypto_blkcipher_ctx(desc->tfm); - struct crypto_blkcipher *tfm; + struct crypto_blkcipher *tfm = desc->tfm; + struct s390_xts_ctx *xts_ctx = crypto_blkcipher_ctx(tfm); + SKCIPHER_REQUEST_ON_STACK(req, xts_ctx->fallback); unsigned int ret; - tfm = desc->tfm; - desc->tfm = xts_ctx->fallback; + skcipher_request_set_tfm(req, xts_ctx->fallback); + skcipher_request_set_callback(req, desc->flags, NULL, NULL); + skcipher_request_set_crypt(req, src, dst, nbytes, desc->info); - ret = crypto_blkcipher_decrypt_iv(desc, dst, src, nbytes); + ret = crypto_skcipher_decrypt(req); - desc->tfm = tfm; + skcipher_request_zero(req); return ret; } @@ -570,16 +575,18 @@ static int xts_fallback_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { - struct s390_xts_ctx *xts_ctx = crypto_blkcipher_ctx(desc->tfm); - struct crypto_blkcipher *tfm; + struct crypto_blkcipher *tfm = desc->tfm; + struct s390_xts_ctx *xts_ctx = crypto_blkcipher_ctx(tfm); + SKCIPHER_REQUEST_ON_STACK(req, xts_ctx->fallback); unsigned int ret; - tfm = desc->tfm; - desc->tfm = xts_ctx->fallback; + skcipher_request_set_tfm(req, xts_ctx->fallback); + skcipher_request_set_callback(req, desc->flags, NULL, NULL); + skcipher_request_set_crypt(req, src, dst, nbytes, desc->info); - ret = crypto_blkcipher_encrypt_iv(desc, dst, src, nbytes); + ret = crypto_skcipher_encrypt(req); - desc->tfm = tfm; + skcipher_request_zero(req); return ret; } @@ -700,8 +707,9 @@ static int xts_fallback_init(struct crypto_tfm *tfm) const char *name = tfm->__crt_alg->cra_name; struct s390_xts_ctx *xts_ctx = crypto_tfm_ctx(tfm); - xts_ctx->fallback = crypto_alloc_blkcipher(name, 0, - CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK); + xts_ctx->fallback = crypto_alloc_skcipher(name, 0, + CRYPTO_ALG_ASYNC | + CRYPTO_ALG_NEED_FALLBACK); if (IS_ERR(xts_ctx->fallback)) { pr_err("Allocating XTS fallback algorithm %s failed\n", @@ -715,8 +723,7 @@ static void xts_fallback_exit(struct crypto_tfm *tfm) { struct s390_xts_ctx *xts_ctx = crypto_tfm_ctx(tfm); - crypto_free_blkcipher(xts_ctx->fallback); - xts_ctx->fallback = NULL; + crypto_free_skcipher(xts_ctx->fallback); } static struct crypto_alg xts_aes_alg = { diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index b9b912a44d61..34b3fa2889d1 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -49,7 +49,9 @@ endif ifeq ($(avx2_supported),yes) obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o - obj-$(CONFIG_CRYPTO_SHA1_MB) += sha-mb/ + obj-$(CONFIG_CRYPTO_SHA1_MB) += sha1-mb/ + obj-$(CONFIG_CRYPTO_SHA256_MB) += sha256-mb/ + obj-$(CONFIG_CRYPTO_SHA512_MB) += sha512-mb/ endif aes-i586-y := aes-i586-asm_32.o aes_glue.o diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 5b7fa1471007..0ab5ee1c26af 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -59,17 +59,6 @@ struct aesni_rfc4106_gcm_ctx { u8 nonce[4]; }; -struct aesni_gcm_set_hash_subkey_result { - int err; - struct completion completion; -}; - -struct aesni_hash_subkey_req_data { - u8 iv[16]; - struct aesni_gcm_set_hash_subkey_result result; - struct scatterlist sg; -}; - struct aesni_lrw_ctx { struct lrw_table_ctx lrw_table; u8 raw_aes_ctx[sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1]; @@ -809,71 +798,28 @@ static void rfc4106_exit(struct crypto_aead *aead) cryptd_free_aead(*ctx); } -static void -rfc4106_set_hash_subkey_done(struct crypto_async_request *req, int err) -{ - struct aesni_gcm_set_hash_subkey_result *result = req->data; - - if (err == -EINPROGRESS) - return; - result->err = err; - complete(&result->completion); -} - static int rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len) { - struct crypto_ablkcipher *ctr_tfm; - struct ablkcipher_request *req; - int ret = -EINVAL; - struct aesni_hash_subkey_req_data *req_data; + struct crypto_cipher *tfm; + int ret; - ctr_tfm = crypto_alloc_ablkcipher("ctr(aes)", 0, 0); - if (IS_ERR(ctr_tfm)) - return PTR_ERR(ctr_tfm); + tfm = crypto_alloc_cipher("aes", 0, 0); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); - ret = crypto_ablkcipher_setkey(ctr_tfm, key, key_len); + ret = crypto_cipher_setkey(tfm, key, key_len); if (ret) - goto out_free_ablkcipher; - - ret = -ENOMEM; - req = ablkcipher_request_alloc(ctr_tfm, GFP_KERNEL); - if (!req) - goto out_free_ablkcipher; - - req_data = kmalloc(sizeof(*req_data), GFP_KERNEL); - if (!req_data) - goto out_free_request; - - memset(req_data->iv, 0, sizeof(req_data->iv)); + goto out_free_cipher; /* Clear the data in the hash sub key container to zero.*/ /* We want to cipher all zeros to create the hash sub key. */ memset(hash_subkey, 0, RFC4106_HASH_SUBKEY_SIZE); - init_completion(&req_data->result.completion); - sg_init_one(&req_data->sg, hash_subkey, RFC4106_HASH_SUBKEY_SIZE); - ablkcipher_request_set_tfm(req, ctr_tfm); - ablkcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP | - CRYPTO_TFM_REQ_MAY_BACKLOG, - rfc4106_set_hash_subkey_done, - &req_data->result); - - ablkcipher_request_set_crypt(req, &req_data->sg, - &req_data->sg, RFC4106_HASH_SUBKEY_SIZE, req_data->iv); - - ret = crypto_ablkcipher_encrypt(req); - if (ret == -EINPROGRESS || ret == -EBUSY) { - ret = wait_for_completion_interruptible - (&req_data->result.completion); - if (!ret) - ret = req_data->result.err; - } - kfree(req_data); -out_free_request: - ablkcipher_request_free(req); -out_free_ablkcipher: - crypto_free_ablkcipher(ctr_tfm); + crypto_cipher_encrypt_one(tfm, hash_subkey, hash_subkey); + +out_free_cipher: + crypto_free_cipher(tfm); return ret; } @@ -1098,9 +1044,12 @@ static int rfc4106_encrypt(struct aead_request *req) struct cryptd_aead **ctx = crypto_aead_ctx(tfm); struct cryptd_aead *cryptd_tfm = *ctx; - aead_request_set_tfm(req, irq_fpu_usable() ? - cryptd_aead_child(cryptd_tfm) : - &cryptd_tfm->base); + tfm = &cryptd_tfm->base; + if (irq_fpu_usable() && (!in_atomic() || + !cryptd_aead_queued(cryptd_tfm))) + tfm = cryptd_aead_child(cryptd_tfm); + + aead_request_set_tfm(req, tfm); return crypto_aead_encrypt(req); } @@ -1111,9 +1060,12 @@ static int rfc4106_decrypt(struct aead_request *req) struct cryptd_aead **ctx = crypto_aead_ctx(tfm); struct cryptd_aead *cryptd_tfm = *ctx; - aead_request_set_tfm(req, irq_fpu_usable() ? - cryptd_aead_child(cryptd_tfm) : - &cryptd_tfm->base); + tfm = &cryptd_tfm->base; + if (irq_fpu_usable() && (!in_atomic() || + !cryptd_aead_queued(cryptd_tfm))) + tfm = cryptd_aead_child(cryptd_tfm); + + aead_request_set_tfm(req, tfm); return crypto_aead_decrypt(req); } diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c index 2d5c2e0bd939..f910d1d449f0 100644 --- a/arch/x86/crypto/chacha20_glue.c +++ b/arch/x86/crypto/chacha20_glue.c @@ -70,7 +70,7 @@ static int chacha20_simd(struct blkcipher_desc *desc, struct scatterlist *dst, struct blkcipher_walk walk; int err; - if (!may_use_simd()) + if (nbytes <= CHACHA20_BLOCK_SIZE || !may_use_simd()) return crypto_chacha20_crypt(desc, dst, src, nbytes); state = (u32 *)roundup((uintptr_t)state_buf, CHACHA20_STATE_ALIGN); diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index a69321a77783..0420bab19efb 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -168,30 +168,23 @@ static int ghash_async_init(struct ahash_request *req) struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); struct ahash_request *cryptd_req = ahash_request_ctx(req); struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; + struct shash_desc *desc = cryptd_shash_desc(cryptd_req); + struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm); - if (!irq_fpu_usable()) { - memcpy(cryptd_req, req, sizeof(*req)); - ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); - return crypto_ahash_init(cryptd_req); - } else { - struct shash_desc *desc = cryptd_shash_desc(cryptd_req); - struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm); - - desc->tfm = child; - desc->flags = req->base.flags; - return crypto_shash_init(desc); - } + desc->tfm = child; + desc->flags = req->base.flags; + return crypto_shash_init(desc); } static int ghash_async_update(struct ahash_request *req) { struct ahash_request *cryptd_req = ahash_request_ctx(req); + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); + struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - if (!irq_fpu_usable()) { - struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); - struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - + if (!irq_fpu_usable() || + (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) { memcpy(cryptd_req, req, sizeof(*req)); ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); return crypto_ahash_update(cryptd_req); @@ -204,12 +197,12 @@ static int ghash_async_update(struct ahash_request *req) static int ghash_async_final(struct ahash_request *req) { struct ahash_request *cryptd_req = ahash_request_ctx(req); + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); + struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - if (!irq_fpu_usable()) { - struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); - struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - + if (!irq_fpu_usable() || + (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) { memcpy(cryptd_req, req, sizeof(*req)); ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); return crypto_ahash_final(cryptd_req); @@ -249,7 +242,8 @@ static int ghash_async_digest(struct ahash_request *req) struct ahash_request *cryptd_req = ahash_request_ctx(req); struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - if (!irq_fpu_usable()) { + if (!irq_fpu_usable() || + (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) { memcpy(cryptd_req, req, sizeof(*req)); ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); return crypto_ahash_digest(cryptd_req); diff --git a/arch/x86/crypto/sha-mb/Makefile b/arch/x86/crypto/sha1-mb/Makefile index 2f8756375df5..2f8756375df5 100644 --- a/arch/x86/crypto/sha-mb/Makefile +++ b/arch/x86/crypto/sha1-mb/Makefile diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha1-mb/sha1_mb.c index 9c5af331a956..9e5b67127a09 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb.c +++ b/arch/x86/crypto/sha1-mb/sha1_mb.c @@ -67,7 +67,7 @@ #include <asm/byteorder.h> #include <linux/hardirq.h> #include <asm/fpu/api.h> -#include "sha_mb_ctx.h" +#include "sha1_mb_ctx.h" #define FLUSH_INTERVAL 1000 /* in usec */ @@ -77,30 +77,34 @@ struct sha1_mb_ctx { struct mcryptd_ahash *mcryptd_tfm; }; -static inline struct mcryptd_hash_request_ctx *cast_hash_to_mcryptd_ctx(struct sha1_hash_ctx *hash_ctx) +static inline struct mcryptd_hash_request_ctx + *cast_hash_to_mcryptd_ctx(struct sha1_hash_ctx *hash_ctx) { - struct shash_desc *desc; + struct ahash_request *areq; - desc = container_of((void *) hash_ctx, struct shash_desc, __ctx); - return container_of(desc, struct mcryptd_hash_request_ctx, desc); + areq = container_of((void *) hash_ctx, struct ahash_request, __ctx); + return container_of(areq, struct mcryptd_hash_request_ctx, areq); } -static inline struct ahash_request *cast_mcryptd_ctx_to_req(struct mcryptd_hash_request_ctx *ctx) +static inline struct ahash_request + *cast_mcryptd_ctx_to_req(struct mcryptd_hash_request_ctx *ctx) { return container_of((void *) ctx, struct ahash_request, __ctx); } static void req_ctx_init(struct mcryptd_hash_request_ctx *rctx, - struct shash_desc *desc) + struct ahash_request *areq) { rctx->flag = HASH_UPDATE; } static asmlinkage void (*sha1_job_mgr_init)(struct sha1_mb_mgr *state); -static asmlinkage struct job_sha1* (*sha1_job_mgr_submit)(struct sha1_mb_mgr *state, - struct job_sha1 *job); -static asmlinkage struct job_sha1* (*sha1_job_mgr_flush)(struct sha1_mb_mgr *state); -static asmlinkage struct job_sha1* (*sha1_job_mgr_get_comp_job)(struct sha1_mb_mgr *state); +static asmlinkage struct job_sha1* (*sha1_job_mgr_submit) + (struct sha1_mb_mgr *state, struct job_sha1 *job); +static asmlinkage struct job_sha1* (*sha1_job_mgr_flush) + (struct sha1_mb_mgr *state); +static asmlinkage struct job_sha1* (*sha1_job_mgr_get_comp_job) + (struct sha1_mb_mgr *state); static inline void sha1_init_digest(uint32_t *digest) { @@ -131,7 +135,8 @@ static inline uint32_t sha1_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], return i >> SHA1_LOG2_BLOCK_SIZE; } -static struct sha1_hash_ctx *sha1_ctx_mgr_resubmit(struct sha1_ctx_mgr *mgr, struct sha1_hash_ctx *ctx) +static struct sha1_hash_ctx *sha1_ctx_mgr_resubmit(struct sha1_ctx_mgr *mgr, + struct sha1_hash_ctx *ctx) { while (ctx) { if (ctx->status & HASH_CTX_STS_COMPLETE) { @@ -177,8 +182,8 @@ static struct sha1_hash_ctx *sha1_ctx_mgr_resubmit(struct sha1_ctx_mgr *mgr, str ctx->job.buffer = (uint8_t *) buffer; ctx->job.len = len; - ctx = (struct sha1_hash_ctx *) sha1_job_mgr_submit(&mgr->mgr, - &ctx->job); + ctx = (struct sha1_hash_ctx *)sha1_job_mgr_submit(&mgr->mgr, + &ctx->job); continue; } } @@ -191,13 +196,15 @@ static struct sha1_hash_ctx *sha1_ctx_mgr_resubmit(struct sha1_ctx_mgr *mgr, str if (ctx->status & HASH_CTX_STS_LAST) { uint8_t *buf = ctx->partial_block_buffer; - uint32_t n_extra_blocks = sha1_pad(buf, ctx->total_length); + uint32_t n_extra_blocks = + sha1_pad(buf, ctx->total_length); ctx->status = (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); ctx->job.buffer = buf; ctx->job.len = (uint32_t) n_extra_blocks; - ctx = (struct sha1_hash_ctx *) sha1_job_mgr_submit(&mgr->mgr, &ctx->job); + ctx = (struct sha1_hash_ctx *) + sha1_job_mgr_submit(&mgr->mgr, &ctx->job); continue; } @@ -208,14 +215,17 @@ static struct sha1_hash_ctx *sha1_ctx_mgr_resubmit(struct sha1_ctx_mgr *mgr, str return NULL; } -static struct sha1_hash_ctx *sha1_ctx_mgr_get_comp_ctx(struct sha1_ctx_mgr *mgr) +static struct sha1_hash_ctx + *sha1_ctx_mgr_get_comp_ctx(struct sha1_ctx_mgr *mgr) { /* * If get_comp_job returns NULL, there are no jobs complete. - * If get_comp_job returns a job, verify that it is safe to return to the user. + * If get_comp_job returns a job, verify that it is safe to return to + * the user. * If it is not ready, resubmit the job to finish processing. * If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned. - * Otherwise, all jobs currently being managed by the hash_ctx_mgr still need processing. + * Otherwise, all jobs currently being managed by the hash_ctx_mgr + * still need processing. */ struct sha1_hash_ctx *ctx; @@ -235,7 +245,10 @@ static struct sha1_hash_ctx *sha1_ctx_mgr_submit(struct sha1_ctx_mgr *mgr, int flags) { if (flags & (~HASH_ENTIRE)) { - /* User should not pass anything other than FIRST, UPDATE, or LAST */ + /* + * User should not pass anything other than FIRST, UPDATE, or + * LAST + */ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; return ctx; } @@ -264,14 +277,20 @@ static struct sha1_hash_ctx *sha1_ctx_mgr_submit(struct sha1_ctx_mgr *mgr, ctx->partial_block_buffer_length = 0; } - /* If we made it here, there were no errors during this call to submit */ + /* + * If we made it here, there were no errors during this call to + * submit + */ ctx->error = HASH_CTX_ERROR_NONE; /* Store buffer ptr info from user */ ctx->incoming_buffer = buffer; ctx->incoming_buffer_length = len; - /* Store the user's request flags and mark this ctx as currently being processed. */ + /* + * Store the user's request flags and mark this ctx as currently + * being processed. + */ ctx->status = (flags & HASH_LAST) ? (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : HASH_CTX_STS_PROCESSING; @@ -285,9 +304,13 @@ static struct sha1_hash_ctx *sha1_ctx_mgr_submit(struct sha1_ctx_mgr *mgr, * Or if the user's buffer contains less than a whole block, * append as much as possible to the extra block. */ - if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) { - /* Compute how many bytes to copy from user buffer into extra block */ - uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (ctx->partial_block_buffer_length || len < SHA1_BLOCK_SIZE) { + /* + * Compute how many bytes to copy from user buffer into + * extra block + */ + uint32_t copy_len = SHA1_BLOCK_SIZE - + ctx->partial_block_buffer_length; if (len < copy_len) copy_len = len; @@ -297,20 +320,28 @@ static struct sha1_hash_ctx *sha1_ctx_mgr_submit(struct sha1_ctx_mgr *mgr, buffer, copy_len); ctx->partial_block_buffer_length += copy_len; - ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer = (const void *) + ((const char *)buffer + copy_len); ctx->incoming_buffer_length = len - copy_len; } - /* The extra block should never contain more than 1 block here */ + /* + * The extra block should never contain more than 1 block + * here + */ assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE); - /* If the extra block buffer contains exactly 1 block, it can be hashed. */ + /* + * If the extra block buffer contains exactly 1 block, it can + * be hashed. + */ if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) { ctx->partial_block_buffer_length = 0; ctx->job.buffer = ctx->partial_block_buffer; ctx->job.len = 1; - ctx = (struct sha1_hash_ctx *) sha1_job_mgr_submit(&mgr->mgr, &ctx->job); + ctx = (struct sha1_hash_ctx *) + sha1_job_mgr_submit(&mgr->mgr, &ctx->job); } } @@ -329,23 +360,24 @@ static struct sha1_hash_ctx *sha1_ctx_mgr_flush(struct sha1_ctx_mgr *mgr) return NULL; /* - * If flush returned a job, resubmit the job to finish processing. + * If flush returned a job, resubmit the job to finish + * processing. */ ctx = sha1_ctx_mgr_resubmit(mgr, ctx); /* - * If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned. - * Otherwise, all jobs currently being managed by the sha1_ctx_mgr - * still need processing. Loop. + * If sha1_ctx_mgr_resubmit returned a job, it is ready to be + * returned. Otherwise, all jobs currently being managed by the + * sha1_ctx_mgr still need processing. Loop. */ if (ctx) return ctx; } } -static int sha1_mb_init(struct shash_desc *desc) +static int sha1_mb_init(struct ahash_request *areq) { - struct sha1_hash_ctx *sctx = shash_desc_ctx(desc); + struct sha1_hash_ctx *sctx = ahash_request_ctx(areq); hash_ctx_init(sctx); sctx->job.result_digest[0] = SHA1_H0; @@ -363,7 +395,7 @@ static int sha1_mb_init(struct shash_desc *desc) static int sha1_mb_set_results(struct mcryptd_hash_request_ctx *rctx) { int i; - struct sha1_hash_ctx *sctx = shash_desc_ctx(&rctx->desc); + struct sha1_hash_ctx *sctx = ahash_request_ctx(&rctx->areq); __be32 *dst = (__be32 *) rctx->out; for (i = 0; i < 5; ++i) @@ -394,9 +426,11 @@ static int sha_finish_walk(struct mcryptd_hash_request_ctx **ret_rctx, flag |= HASH_LAST; } - sha_ctx = (struct sha1_hash_ctx *) shash_desc_ctx(&rctx->desc); + sha_ctx = (struct sha1_hash_ctx *) + ahash_request_ctx(&rctx->areq); kernel_fpu_begin(); - sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data, nbytes, flag); + sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, + rctx->walk.data, nbytes, flag); if (!sha_ctx) { if (flush) sha_ctx = sha1_ctx_mgr_flush(cstate->mgr); @@ -485,11 +519,10 @@ static void sha1_mb_add_list(struct mcryptd_hash_request_ctx *rctx, mcryptd_arm_flusher(cstate, delay); } -static int sha1_mb_update(struct shash_desc *desc, const u8 *data, - unsigned int len) +static int sha1_mb_update(struct ahash_request *areq) { struct mcryptd_hash_request_ctx *rctx = - container_of(desc, struct mcryptd_hash_request_ctx, desc); + container_of(areq, struct mcryptd_hash_request_ctx, areq); struct mcryptd_alg_cstate *cstate = this_cpu_ptr(sha1_mb_alg_state.alg_cstate); @@ -505,7 +538,7 @@ static int sha1_mb_update(struct shash_desc *desc, const u8 *data, } /* need to init context */ - req_ctx_init(rctx, desc); + req_ctx_init(rctx, areq); nbytes = crypto_ahash_walk_first(req, &rctx->walk); @@ -518,10 +551,11 @@ static int sha1_mb_update(struct shash_desc *desc, const u8 *data, rctx->flag |= HASH_DONE; /* submit */ - sha_ctx = (struct sha1_hash_ctx *) shash_desc_ctx(desc); + sha_ctx = (struct sha1_hash_ctx *) ahash_request_ctx(areq); sha1_mb_add_list(rctx, cstate); kernel_fpu_begin(); - sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data, nbytes, HASH_UPDATE); + sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data, + nbytes, HASH_UPDATE); kernel_fpu_end(); /* check if anything is returned */ @@ -544,11 +578,10 @@ done: return ret; } -static int sha1_mb_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) +static int sha1_mb_finup(struct ahash_request *areq) { struct mcryptd_hash_request_ctx *rctx = - container_of(desc, struct mcryptd_hash_request_ctx, desc); + container_of(areq, struct mcryptd_hash_request_ctx, areq); struct mcryptd_alg_cstate *cstate = this_cpu_ptr(sha1_mb_alg_state.alg_cstate); @@ -563,7 +596,7 @@ static int sha1_mb_finup(struct shash_desc *desc, const u8 *data, } /* need to init context */ - req_ctx_init(rctx, desc); + req_ctx_init(rctx, areq); nbytes = crypto_ahash_walk_first(req, &rctx->walk); @@ -576,15 +609,15 @@ static int sha1_mb_finup(struct shash_desc *desc, const u8 *data, rctx->flag |= HASH_DONE; flag = HASH_LAST; } - rctx->out = out; /* submit */ rctx->flag |= HASH_FINAL; - sha_ctx = (struct sha1_hash_ctx *) shash_desc_ctx(desc); + sha_ctx = (struct sha1_hash_ctx *) ahash_request_ctx(areq); sha1_mb_add_list(rctx, cstate); kernel_fpu_begin(); - sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data, nbytes, flag); + sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data, + nbytes, flag); kernel_fpu_end(); /* check if anything is returned */ @@ -605,10 +638,10 @@ done: return ret; } -static int sha1_mb_final(struct shash_desc *desc, u8 *out) +static int sha1_mb_final(struct ahash_request *areq) { struct mcryptd_hash_request_ctx *rctx = - container_of(desc, struct mcryptd_hash_request_ctx, desc); + container_of(areq, struct mcryptd_hash_request_ctx, areq); struct mcryptd_alg_cstate *cstate = this_cpu_ptr(sha1_mb_alg_state.alg_cstate); @@ -623,16 +656,16 @@ static int sha1_mb_final(struct shash_desc *desc, u8 *out) } /* need to init context */ - req_ctx_init(rctx, desc); + req_ctx_init(rctx, areq); - rctx->out = out; rctx->flag |= HASH_DONE | HASH_FINAL; - sha_ctx = (struct sha1_hash_ctx *) shash_desc_ctx(desc); + sha_ctx = (struct sha1_hash_ctx *) ahash_request_ctx(areq); /* flag HASH_FINAL and 0 data size */ sha1_mb_add_list(rctx, cstate); kernel_fpu_begin(); - sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, &data, 0, HASH_LAST); + sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, &data, 0, + HASH_LAST); kernel_fpu_end(); /* check if anything is returned */ @@ -654,48 +687,98 @@ done: return ret; } -static int sha1_mb_export(struct shash_desc *desc, void *out) +static int sha1_mb_export(struct ahash_request *areq, void *out) { - struct sha1_hash_ctx *sctx = shash_desc_ctx(desc); + struct sha1_hash_ctx *sctx = ahash_request_ctx(areq); memcpy(out, sctx, sizeof(*sctx)); return 0; } -static int sha1_mb_import(struct shash_desc *desc, const void *in) +static int sha1_mb_import(struct ahash_request *areq, const void *in) { - struct sha1_hash_ctx *sctx = shash_desc_ctx(desc); + struct sha1_hash_ctx *sctx = ahash_request_ctx(areq); memcpy(sctx, in, sizeof(*sctx)); return 0; } +static int sha1_mb_async_init_tfm(struct crypto_tfm *tfm) +{ + struct mcryptd_ahash *mcryptd_tfm; + struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm); + struct mcryptd_hash_ctx *mctx; -static struct shash_alg sha1_mb_shash_alg = { - .digestsize = SHA1_DIGEST_SIZE, + mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha1-mb", + CRYPTO_ALG_INTERNAL, + CRYPTO_ALG_INTERNAL); + if (IS_ERR(mcryptd_tfm)) + return PTR_ERR(mcryptd_tfm); + mctx = crypto_ahash_ctx(&mcryptd_tfm->base); + mctx->alg_state = &sha1_mb_alg_state; + ctx->mcryptd_tfm = mcryptd_tfm; + crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm), + sizeof(struct ahash_request) + + crypto_ahash_reqsize(&mcryptd_tfm->base)); + + return 0; +} + +static void sha1_mb_async_exit_tfm(struct crypto_tfm *tfm) +{ + struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm); + + mcryptd_free_ahash(ctx->mcryptd_tfm); +} + +static int sha1_mb_areq_init_tfm(struct crypto_tfm *tfm) +{ + crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm), + sizeof(struct ahash_request) + + sizeof(struct sha1_hash_ctx)); + + return 0; +} + +static void sha1_mb_areq_exit_tfm(struct crypto_tfm *tfm) +{ + struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm); + + mcryptd_free_ahash(ctx->mcryptd_tfm); +} + +static struct ahash_alg sha1_mb_areq_alg = { .init = sha1_mb_init, .update = sha1_mb_update, .final = sha1_mb_final, .finup = sha1_mb_finup, .export = sha1_mb_export, .import = sha1_mb_import, - .descsize = sizeof(struct sha1_hash_ctx), - .statesize = sizeof(struct sha1_hash_ctx), - .base = { - .cra_name = "__sha1-mb", - .cra_driver_name = "__intel_sha1-mb", - .cra_priority = 100, - /* - * use ASYNC flag as some buffers in multi-buffer - * algo may not have completed before hashing thread sleep - */ - .cra_flags = CRYPTO_ALG_TYPE_SHASH | CRYPTO_ALG_ASYNC | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = SHA1_BLOCK_SIZE, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(sha1_mb_shash_alg.base.cra_list), + .halg = { + .digestsize = SHA1_DIGEST_SIZE, + .statesize = sizeof(struct sha1_hash_ctx), + .base = { + .cra_name = "__sha1-mb", + .cra_driver_name = "__intel_sha1-mb", + .cra_priority = 100, + /* + * use ASYNC flag as some buffers in multi-buffer + * algo may not have completed before hashing thread + * sleep + */ + .cra_flags = CRYPTO_ALG_TYPE_AHASH | + CRYPTO_ALG_ASYNC | + CRYPTO_ALG_INTERNAL, + .cra_blocksize = SHA1_BLOCK_SIZE, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT + (sha1_mb_areq_alg.halg.base.cra_list), + .cra_init = sha1_mb_areq_init_tfm, + .cra_exit = sha1_mb_areq_exit_tfm, + .cra_ctxsize = sizeof(struct sha1_hash_ctx), + } } }; @@ -780,46 +863,20 @@ static int sha1_mb_async_import(struct ahash_request *req, const void *in) struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm); struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; - struct crypto_shash *child = mcryptd_ahash_child(mcryptd_tfm); + struct crypto_ahash *child = mcryptd_ahash_child(mcryptd_tfm); struct mcryptd_hash_request_ctx *rctx; - struct shash_desc *desc; + struct ahash_request *areq; memcpy(mcryptd_req, req, sizeof(*req)); ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); rctx = ahash_request_ctx(mcryptd_req); - desc = &rctx->desc; - desc->tfm = child; - desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; - - return crypto_ahash_import(mcryptd_req, in); -} - -static int sha1_mb_async_init_tfm(struct crypto_tfm *tfm) -{ - struct mcryptd_ahash *mcryptd_tfm; - struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm); - struct mcryptd_hash_ctx *mctx; + areq = &rctx->areq; - mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha1-mb", - CRYPTO_ALG_INTERNAL, - CRYPTO_ALG_INTERNAL); - if (IS_ERR(mcryptd_tfm)) - return PTR_ERR(mcryptd_tfm); - mctx = crypto_ahash_ctx(&mcryptd_tfm->base); - mctx->alg_state = &sha1_mb_alg_state; - ctx->mcryptd_tfm = mcryptd_tfm; - crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm), - sizeof(struct ahash_request) + - crypto_ahash_reqsize(&mcryptd_tfm->base)); + ahash_request_set_tfm(areq, child); + ahash_request_set_callback(areq, CRYPTO_TFM_REQ_MAY_SLEEP, + rctx->complete, req); - return 0; -} - -static void sha1_mb_async_exit_tfm(struct crypto_tfm *tfm) -{ - struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm); - - mcryptd_free_ahash(ctx->mcryptd_tfm); + return crypto_ahash_import(mcryptd_req, in); } static struct ahash_alg sha1_mb_async_alg = { @@ -866,7 +923,8 @@ static unsigned long sha1_mb_flusher(struct mcryptd_alg_cstate *cstate) if (time_before(cur_time, rctx->tag.expire)) break; kernel_fpu_begin(); - sha_ctx = (struct sha1_hash_ctx *) sha1_ctx_mgr_flush(cstate->mgr); + sha_ctx = (struct sha1_hash_ctx *) + sha1_ctx_mgr_flush(cstate->mgr); kernel_fpu_end(); if (!sha_ctx) { pr_err("sha1_mb error: nothing got flushed for non-empty list\n"); @@ -927,7 +985,7 @@ static int __init sha1_mb_mod_init(void) } sha1_mb_alg_state.flusher = &sha1_mb_flusher; - err = crypto_register_shash(&sha1_mb_shash_alg); + err = crypto_register_ahash(&sha1_mb_areq_alg); if (err) goto err2; err = crypto_register_ahash(&sha1_mb_async_alg); @@ -937,7 +995,7 @@ static int __init sha1_mb_mod_init(void) return 0; err1: - crypto_unregister_shash(&sha1_mb_shash_alg); + crypto_unregister_ahash(&sha1_mb_areq_alg); err2: for_each_possible_cpu(cpu) { cpu_state = per_cpu_ptr(sha1_mb_alg_state.alg_cstate, cpu); @@ -953,7 +1011,7 @@ static void __exit sha1_mb_mod_fini(void) struct mcryptd_alg_cstate *cpu_state; crypto_unregister_ahash(&sha1_mb_async_alg); - crypto_unregister_shash(&sha1_mb_shash_alg); + crypto_unregister_ahash(&sha1_mb_areq_alg); for_each_possible_cpu(cpu) { cpu_state = per_cpu_ptr(sha1_mb_alg_state.alg_cstate, cpu); kfree(cpu_state->mgr); diff --git a/arch/x86/crypto/sha-mb/sha_mb_ctx.h b/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h index e36069d0c1bd..98a35bcc6f4a 100644 --- a/arch/x86/crypto/sha-mb/sha_mb_ctx.h +++ b/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h @@ -54,7 +54,7 @@ #ifndef _SHA_MB_CTX_INTERNAL_H #define _SHA_MB_CTX_INTERNAL_H -#include "sha_mb_mgr.h" +#include "sha1_mb_mgr.h" #define HASH_UPDATE 0x00 #define HASH_FIRST 0x01 diff --git a/arch/x86/crypto/sha-mb/sha_mb_mgr.h b/arch/x86/crypto/sha1-mb/sha1_mb_mgr.h index 08ad1a9acfd7..08ad1a9acfd7 100644 --- a/arch/x86/crypto/sha-mb/sha_mb_mgr.h +++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr.h diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_datastruct.S b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_datastruct.S index 86688c6e7a25..86688c6e7a25 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb_mgr_datastruct.S +++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_datastruct.S diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S index 96df6a39d7e2..96df6a39d7e2 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S +++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_init_avx2.c index 822acb5b464c..d2add0d35f43 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c +++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_init_avx2.c @@ -51,7 +51,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "sha_mb_mgr.h" +#include "sha1_mb_mgr.h" void sha1_mb_mgr_init_avx2(struct sha1_mb_mgr *state) { diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S index 63a0d9c8e31f..63a0d9c8e31f 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S +++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S diff --git a/arch/x86/crypto/sha-mb/sha1_x8_avx2.S b/arch/x86/crypto/sha1-mb/sha1_x8_avx2.S index c9dae1cd2919..c9dae1cd2919 100644 --- a/arch/x86/crypto/sha-mb/sha1_x8_avx2.S +++ b/arch/x86/crypto/sha1-mb/sha1_x8_avx2.S diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index 1024e378a358..fc61739150e7 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -374,3 +374,9 @@ MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, Supplemental SSE3 accelerated"); MODULE_ALIAS_CRYPTO("sha1"); +MODULE_ALIAS_CRYPTO("sha1-ssse3"); +MODULE_ALIAS_CRYPTO("sha1-avx"); +MODULE_ALIAS_CRYPTO("sha1-avx2"); +#ifdef CONFIG_AS_SHA1_NI +MODULE_ALIAS_CRYPTO("sha1-ni"); +#endif diff --git a/arch/x86/crypto/sha256-mb/Makefile b/arch/x86/crypto/sha256-mb/Makefile new file mode 100644 index 000000000000..41089e7c400c --- /dev/null +++ b/arch/x86/crypto/sha256-mb/Makefile @@ -0,0 +1,11 @@ +# +# Arch-specific CryptoAPI modules. +# + +avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\ + $(comma)4)$(comma)%ymm2,yes,no) +ifeq ($(avx2_supported),yes) + obj-$(CONFIG_CRYPTO_SHA256_MB) += sha256-mb.o + sha256-mb-y := sha256_mb.o sha256_mb_mgr_flush_avx2.o \ + sha256_mb_mgr_init_avx2.o sha256_mb_mgr_submit_avx2.o sha256_x8_avx2.o +endif diff --git a/arch/x86/crypto/sha256-mb/sha256_mb.c b/arch/x86/crypto/sha256-mb/sha256_mb.c new file mode 100644 index 000000000000..89fa85e8b10c --- /dev/null +++ b/arch/x86/crypto/sha256-mb/sha256_mb.c @@ -0,0 +1,1030 @@ +/* + * Multi buffer SHA256 algorithm Glue Code + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2016 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Megha Dey <megha.dey@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2016 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <crypto/internal/hash.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/cryptohash.h> +#include <linux/types.h> +#include <linux/list.h> +#include <crypto/scatterwalk.h> +#include <crypto/sha.h> +#include <crypto/mcryptd.h> +#include <crypto/crypto_wq.h> +#include <asm/byteorder.h> +#include <linux/hardirq.h> +#include <asm/fpu/api.h> +#include "sha256_mb_ctx.h" + +#define FLUSH_INTERVAL 1000 /* in usec */ + +static struct mcryptd_alg_state sha256_mb_alg_state; + +struct sha256_mb_ctx { + struct mcryptd_ahash *mcryptd_tfm; +}; + +static inline struct mcryptd_hash_request_ctx + *cast_hash_to_mcryptd_ctx(struct sha256_hash_ctx *hash_ctx) +{ + struct ahash_request *areq; + + areq = container_of((void *) hash_ctx, struct ahash_request, __ctx); + return container_of(areq, struct mcryptd_hash_request_ctx, areq); +} + +static inline struct ahash_request + *cast_mcryptd_ctx_to_req(struct mcryptd_hash_request_ctx *ctx) +{ + return container_of((void *) ctx, struct ahash_request, __ctx); +} + +static void req_ctx_init(struct mcryptd_hash_request_ctx *rctx, + struct ahash_request *areq) +{ + rctx->flag = HASH_UPDATE; +} + +static asmlinkage void (*sha256_job_mgr_init)(struct sha256_mb_mgr *state); +static asmlinkage struct job_sha256* (*sha256_job_mgr_submit) + (struct sha256_mb_mgr *state, struct job_sha256 *job); +static asmlinkage struct job_sha256* (*sha256_job_mgr_flush) + (struct sha256_mb_mgr *state); +static asmlinkage struct job_sha256* (*sha256_job_mgr_get_comp_job) + (struct sha256_mb_mgr *state); + +inline void sha256_init_digest(uint32_t *digest) +{ + static const uint32_t initial_digest[SHA256_DIGEST_LENGTH] = { + SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3, + SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7}; + memcpy(digest, initial_digest, sizeof(initial_digest)); +} + +inline uint32_t sha256_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], + uint32_t total_len) +{ + uint32_t i = total_len & (SHA256_BLOCK_SIZE - 1); + + memset(&padblock[i], 0, SHA256_BLOCK_SIZE); + padblock[i] = 0x80; + + i += ((SHA256_BLOCK_SIZE - 1) & + (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) + + 1 + SHA256_PADLENGTHFIELD_SIZE; + +#if SHA256_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) &padblock[i - 16]) = 0; +#endif + + *((uint64_t *) &padblock[i - 8]) = cpu_to_be64(total_len << 3); + + /* Number of extra blocks to hash */ + return i >> SHA256_LOG2_BLOCK_SIZE; +} + +static struct sha256_hash_ctx + *sha256_ctx_mgr_resubmit(struct sha256_ctx_mgr *mgr, + struct sha256_hash_ctx *ctx) +{ + while (ctx) { + if (ctx->status & HASH_CTX_STS_COMPLETE) { + /* Clear PROCESSING bit */ + ctx->status = HASH_CTX_STS_COMPLETE; + return ctx; + } + + /* + * If the extra blocks are empty, begin hashing what remains + * in the user's buffer. + */ + if (ctx->partial_block_buffer_length == 0 && + ctx->incoming_buffer_length) { + + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + uint32_t copy_len; + + /* + * Only entire blocks can be hashed. + * Copy remainder to extra blocks buffer. + */ + copy_len = len & (SHA256_BLOCK_SIZE-1); + + if (copy_len) { + len -= copy_len; + memcpy(ctx->partial_block_buffer, + ((const char *) buffer + len), + copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + /* len should be a multiple of the block size now */ + assert((len % SHA256_BLOCK_SIZE) == 0); + + /* Set len to the number of blocks to be hashed */ + len >>= SHA256_LOG2_BLOCK_SIZE; + + if (len) { + + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (struct sha256_hash_ctx *) + sha256_job_mgr_submit(&mgr->mgr, &ctx->job); + continue; + } + } + + /* + * If the extra blocks are not empty, then we are + * either on the last block(s) or we need more + * user input before continuing. + */ + if (ctx->status & HASH_CTX_STS_LAST) { + + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = + sha256_pad(buf, ctx->total_length); + + ctx->status = (HASH_CTX_STS_PROCESSING | + HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (struct sha256_hash_ctx *) + sha256_job_mgr_submit(&mgr->mgr, &ctx->job); + continue; + } + + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static struct sha256_hash_ctx + *sha256_ctx_mgr_get_comp_ctx(struct sha256_ctx_mgr *mgr) +{ + /* + * If get_comp_job returns NULL, there are no jobs complete. + * If get_comp_job returns a job, verify that it is safe to return to + * the user. If it is not ready, resubmit the job to finish processing. + * If sha256_ctx_mgr_resubmit returned a job, it is ready to be + * returned. Otherwise, all jobs currently being managed by the + * hash_ctx_mgr still need processing. + */ + struct sha256_hash_ctx *ctx; + + ctx = (struct sha256_hash_ctx *) sha256_job_mgr_get_comp_job(&mgr->mgr); + return sha256_ctx_mgr_resubmit(mgr, ctx); +} + +static void sha256_ctx_mgr_init(struct sha256_ctx_mgr *mgr) +{ + sha256_job_mgr_init(&mgr->mgr); +} + +static struct sha256_hash_ctx *sha256_ctx_mgr_submit(struct sha256_ctx_mgr *mgr, + struct sha256_hash_ctx *ctx, + const void *buffer, + uint32_t len, + int flags) +{ + if (flags & (~HASH_ENTIRE)) { + /* User should not pass anything other than FIRST, UPDATE + * or LAST + */ + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + /* Cannot submit to a currently processing job. */ + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + /* Cannot update a finished job. */ + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + /* Init digest */ + sha256_init_digest(ctx->job.result_digest); + + /* Reset byte counter */ + ctx->total_length = 0; + + /* Clear extra blocks */ + ctx->partial_block_buffer_length = 0; + } + + /* If we made it here, there was no error during this call to submit */ + ctx->error = HASH_CTX_ERROR_NONE; + + /* Store buffer ptr info from user */ + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + /* + * Store the user's request flags and mark this ctx as currently + * being processed. + */ + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + /* Advance byte counter */ + ctx->total_length += len; + + /* + * If there is anything currently buffered in the extra blocks, + * append to it until it contains a whole block. + * Or if the user's buffer contains less than a whole block, + * append as much as possible to the extra block. + */ + if (ctx->partial_block_buffer_length || len < SHA256_BLOCK_SIZE) { + /* + * Compute how many bytes to copy from user buffer into + * extra block + */ + uint32_t copy_len = SHA256_BLOCK_SIZE - + ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + /* Copy and update relevant pointers and counters */ + memcpy( + &ctx->partial_block_buffer[ctx->partial_block_buffer_length], + buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *) + ((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + + /* The extra block should never contain more than 1 block */ + assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE); + + /* + * If the extra block buffer contains exactly 1 block, + * it can be hashed. + */ + if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + ctx = (struct sha256_hash_ctx *) + sha256_job_mgr_submit(&mgr->mgr, &ctx->job); + } + } + + return sha256_ctx_mgr_resubmit(mgr, ctx); +} + +static struct sha256_hash_ctx *sha256_ctx_mgr_flush(struct sha256_ctx_mgr *mgr) +{ + struct sha256_hash_ctx *ctx; + + while (1) { + ctx = (struct sha256_hash_ctx *) + sha256_job_mgr_flush(&mgr->mgr); + + /* If flush returned 0, there are no more jobs in flight. */ + if (!ctx) + return NULL; + + /* + * If flush returned a job, resubmit the job to finish + * processing. + */ + ctx = sha256_ctx_mgr_resubmit(mgr, ctx); + + /* + * If sha256_ctx_mgr_resubmit returned a job, it is ready to + * be returned. Otherwise, all jobs currently being managed by + * the sha256_ctx_mgr still need processing. Loop. + */ + if (ctx) + return ctx; + } +} + +static int sha256_mb_init(struct ahash_request *areq) +{ + struct sha256_hash_ctx *sctx = ahash_request_ctx(areq); + + hash_ctx_init(sctx); + sctx->job.result_digest[0] = SHA256_H0; + sctx->job.result_digest[1] = SHA256_H1; + sctx->job.result_digest[2] = SHA256_H2; + sctx->job.result_digest[3] = SHA256_H3; + sctx->job.result_digest[4] = SHA256_H4; + sctx->job.result_digest[5] = SHA256_H5; + sctx->job.result_digest[6] = SHA256_H6; + sctx->job.result_digest[7] = SHA256_H7; + sctx->total_length = 0; + sctx->partial_block_buffer_length = 0; + sctx->status = HASH_CTX_STS_IDLE; + + return 0; +} + +static int sha256_mb_set_results(struct mcryptd_hash_request_ctx *rctx) +{ + int i; + struct sha256_hash_ctx *sctx = ahash_request_ctx(&rctx->areq); + __be32 *dst = (__be32 *) rctx->out; + + for (i = 0; i < 8; ++i) + dst[i] = cpu_to_be32(sctx->job.result_digest[i]); + + return 0; +} + +static int sha_finish_walk(struct mcryptd_hash_request_ctx **ret_rctx, + struct mcryptd_alg_cstate *cstate, bool flush) +{ + int flag = HASH_UPDATE; + int nbytes, err = 0; + struct mcryptd_hash_request_ctx *rctx = *ret_rctx; + struct sha256_hash_ctx *sha_ctx; + + /* more work ? */ + while (!(rctx->flag & HASH_DONE)) { + nbytes = crypto_ahash_walk_done(&rctx->walk, 0); + if (nbytes < 0) { + err = nbytes; + goto out; + } + /* check if the walk is done */ + if (crypto_ahash_walk_last(&rctx->walk)) { + rctx->flag |= HASH_DONE; + if (rctx->flag & HASH_FINAL) + flag |= HASH_LAST; + + } + sha_ctx = (struct sha256_hash_ctx *) + ahash_request_ctx(&rctx->areq); + kernel_fpu_begin(); + sha_ctx = sha256_ctx_mgr_submit(cstate->mgr, sha_ctx, + rctx->walk.data, nbytes, flag); + if (!sha_ctx) { + if (flush) + sha_ctx = sha256_ctx_mgr_flush(cstate->mgr); + } + kernel_fpu_end(); + if (sha_ctx) + rctx = cast_hash_to_mcryptd_ctx(sha_ctx); + else { + rctx = NULL; + goto out; + } + } + + /* copy the results */ + if (rctx->flag & HASH_FINAL) + sha256_mb_set_results(rctx); + +out: + *ret_rctx = rctx; + return err; +} + +static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx, + struct mcryptd_alg_cstate *cstate, + int err) +{ + struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx); + struct sha256_hash_ctx *sha_ctx; + struct mcryptd_hash_request_ctx *req_ctx; + int ret; + + /* remove from work list */ + spin_lock(&cstate->work_lock); + list_del(&rctx->waiter); + spin_unlock(&cstate->work_lock); + + if (irqs_disabled()) + rctx->complete(&req->base, err); + else { + local_bh_disable(); + rctx->complete(&req->base, err); + local_bh_enable(); + } + + /* check to see if there are other jobs that are done */ + sha_ctx = sha256_ctx_mgr_get_comp_ctx(cstate->mgr); + while (sha_ctx) { + req_ctx = cast_hash_to_mcryptd_ctx(sha_ctx); + ret = sha_finish_walk(&req_ctx, cstate, false); + if (req_ctx) { + spin_lock(&cstate->work_lock); + list_del(&req_ctx->waiter); + spin_unlock(&cstate->work_lock); + + req = cast_mcryptd_ctx_to_req(req_ctx); + if (irqs_disabled()) + rctx->complete(&req->base, ret); + else { + local_bh_disable(); + rctx->complete(&req->base, ret); + local_bh_enable(); + } + } + sha_ctx = sha256_ctx_mgr_get_comp_ctx(cstate->mgr); + } + + return 0; +} + +static void sha256_mb_add_list(struct mcryptd_hash_request_ctx *rctx, + struct mcryptd_alg_cstate *cstate) +{ + unsigned long next_flush; + unsigned long delay = usecs_to_jiffies(FLUSH_INTERVAL); + + /* initialize tag */ + rctx->tag.arrival = jiffies; /* tag the arrival time */ + rctx->tag.seq_num = cstate->next_seq_num++; + next_flush = rctx->tag.arrival + delay; + rctx->tag.expire = next_flush; + + spin_lock(&cstate->work_lock); + list_add_tail(&rctx->waiter, &cstate->work_list); + spin_unlock(&cstate->work_lock); + + mcryptd_arm_flusher(cstate, delay); +} + +static int sha256_mb_update(struct ahash_request *areq) +{ + struct mcryptd_hash_request_ctx *rctx = + container_of(areq, struct mcryptd_hash_request_ctx, areq); + struct mcryptd_alg_cstate *cstate = + this_cpu_ptr(sha256_mb_alg_state.alg_cstate); + + struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx); + struct sha256_hash_ctx *sha_ctx; + int ret = 0, nbytes; + + /* sanity check */ + if (rctx->tag.cpu != smp_processor_id()) { + pr_err("mcryptd error: cpu clash\n"); + goto done; + } + + /* need to init context */ + req_ctx_init(rctx, areq); + + nbytes = crypto_ahash_walk_first(req, &rctx->walk); + + if (nbytes < 0) { + ret = nbytes; + goto done; + } + + if (crypto_ahash_walk_last(&rctx->walk)) + rctx->flag |= HASH_DONE; + + /* submit */ + sha_ctx = (struct sha256_hash_ctx *) ahash_request_ctx(areq); + sha256_mb_add_list(rctx, cstate); + kernel_fpu_begin(); + sha_ctx = sha256_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data, + nbytes, HASH_UPDATE); + kernel_fpu_end(); + + /* check if anything is returned */ + if (!sha_ctx) + return -EINPROGRESS; + + if (sha_ctx->error) { + ret = sha_ctx->error; + rctx = cast_hash_to_mcryptd_ctx(sha_ctx); + goto done; + } + + rctx = cast_hash_to_mcryptd_ctx(sha_ctx); + ret = sha_finish_walk(&rctx, cstate, false); + + if (!rctx) + return -EINPROGRESS; +done: + sha_complete_job(rctx, cstate, ret); + return ret; +} + +static int sha256_mb_finup(struct ahash_request *areq) +{ + struct mcryptd_hash_request_ctx *rctx = + container_of(areq, struct mcryptd_hash_request_ctx, areq); + struct mcryptd_alg_cstate *cstate = + this_cpu_ptr(sha256_mb_alg_state.alg_cstate); + + struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx); + struct sha256_hash_ctx *sha_ctx; + int ret = 0, flag = HASH_UPDATE, nbytes; + + /* sanity check */ + if (rctx->tag.cpu != smp_processor_id()) { + pr_err("mcryptd error: cpu clash\n"); + goto done; + } + + /* need to init context */ + req_ctx_init(rctx, areq); + + nbytes = crypto_ahash_walk_first(req, &rctx->walk); + + if (nbytes < 0) { + ret = nbytes; + goto done; + } + + if (crypto_ahash_walk_last(&rctx->walk)) { + rctx->flag |= HASH_DONE; + flag = HASH_LAST; + } + + /* submit */ + rctx->flag |= HASH_FINAL; + sha_ctx = (struct sha256_hash_ctx *) ahash_request_ctx(areq); + sha256_mb_add_list(rctx, cstate); + + kernel_fpu_begin(); + sha_ctx = sha256_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data, + nbytes, flag); + kernel_fpu_end(); + + /* check if anything is returned */ + if (!sha_ctx) + return -EINPROGRESS; + + if (sha_ctx->error) { + ret = sha_ctx->error; + goto done; + } + + rctx = cast_hash_to_mcryptd_ctx(sha_ctx); + ret = sha_finish_walk(&rctx, cstate, false); + if (!rctx) + return -EINPROGRESS; +done: + sha_complete_job(rctx, cstate, ret); + return ret; +} + +static int sha256_mb_final(struct ahash_request *areq) +{ + struct mcryptd_hash_request_ctx *rctx = + container_of(areq, struct mcryptd_hash_request_ctx, + areq); + struct mcryptd_alg_cstate *cstate = + this_cpu_ptr(sha256_mb_alg_state.alg_cstate); + + struct sha256_hash_ctx *sha_ctx; + int ret = 0; + u8 data; + + /* sanity check */ + if (rctx->tag.cpu != smp_processor_id()) { + pr_err("mcryptd error: cpu clash\n"); + goto done; + } + + /* need to init context */ + req_ctx_init(rctx, areq); + + rctx->flag |= HASH_DONE | HASH_FINAL; + + sha_ctx = (struct sha256_hash_ctx *) ahash_request_ctx(areq); + /* flag HASH_FINAL and 0 data size */ + sha256_mb_add_list(rctx, cstate); + kernel_fpu_begin(); + sha_ctx = sha256_ctx_mgr_submit(cstate->mgr, sha_ctx, &data, 0, + HASH_LAST); + kernel_fpu_end(); + + /* check if anything is returned */ + if (!sha_ctx) + return -EINPROGRESS; + + if (sha_ctx->error) { + ret = sha_ctx->error; + rctx = cast_hash_to_mcryptd_ctx(sha_ctx); + goto done; + } + + rctx = cast_hash_to_mcryptd_ctx(sha_ctx); + ret = sha_finish_walk(&rctx, cstate, false); + if (!rctx) + return -EINPROGRESS; +done: + sha_complete_job(rctx, cstate, ret); + return ret; +} + +static int sha256_mb_export(struct ahash_request *areq, void *out) +{ + struct sha256_hash_ctx *sctx = ahash_request_ctx(areq); + + memcpy(out, sctx, sizeof(*sctx)); + + return 0; +} + +static int sha256_mb_import(struct ahash_request *areq, const void *in) +{ + struct sha256_hash_ctx *sctx = ahash_request_ctx(areq); + + memcpy(sctx, in, sizeof(*sctx)); + + return 0; +} + +static int sha256_mb_async_init_tfm(struct crypto_tfm *tfm) +{ + struct mcryptd_ahash *mcryptd_tfm; + struct sha256_mb_ctx *ctx = crypto_tfm_ctx(tfm); + struct mcryptd_hash_ctx *mctx; + + mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha256-mb", + CRYPTO_ALG_INTERNAL, + CRYPTO_ALG_INTERNAL); + if (IS_ERR(mcryptd_tfm)) + return PTR_ERR(mcryptd_tfm); + mctx = crypto_ahash_ctx(&mcryptd_tfm->base); + mctx->alg_state = &sha256_mb_alg_state; + ctx->mcryptd_tfm = mcryptd_tfm; + crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm), + sizeof(struct ahash_request) + + crypto_ahash_reqsize(&mcryptd_tfm->base)); + + return 0; +} + +static void sha256_mb_async_exit_tfm(struct crypto_tfm *tfm) +{ + struct sha256_mb_ctx *ctx = crypto_tfm_ctx(tfm); + + mcryptd_free_ahash(ctx->mcryptd_tfm); +} + +static int sha256_mb_areq_init_tfm(struct crypto_tfm *tfm) +{ + crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm), + sizeof(struct ahash_request) + + sizeof(struct sha256_hash_ctx)); + + return 0; +} + +static void sha256_mb_areq_exit_tfm(struct crypto_tfm *tfm) +{ + struct sha256_mb_ctx *ctx = crypto_tfm_ctx(tfm); + + mcryptd_free_ahash(ctx->mcryptd_tfm); +} + +static struct ahash_alg sha256_mb_areq_alg = { + .init = sha256_mb_init, + .update = sha256_mb_update, + .final = sha256_mb_final, + .finup = sha256_mb_finup, + .export = sha256_mb_export, + .import = sha256_mb_import, + .halg = { + .digestsize = SHA256_DIGEST_SIZE, + .statesize = sizeof(struct sha256_hash_ctx), + .base = { + .cra_name = "__sha256-mb", + .cra_driver_name = "__intel_sha256-mb", + .cra_priority = 100, + /* + * use ASYNC flag as some buffers in multi-buffer + * algo may not have completed before hashing thread + * sleep + */ + .cra_flags = CRYPTO_ALG_TYPE_AHASH | + CRYPTO_ALG_ASYNC | + CRYPTO_ALG_INTERNAL, + .cra_blocksize = SHA256_BLOCK_SIZE, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT + (sha256_mb_areq_alg.halg.base.cra_list), + .cra_init = sha256_mb_areq_init_tfm, + .cra_exit = sha256_mb_areq_exit_tfm, + .cra_ctxsize = sizeof(struct sha256_hash_ctx), + } + } +}; + +static int sha256_mb_async_init(struct ahash_request *req) +{ + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + return crypto_ahash_init(mcryptd_req); +} + +static int sha256_mb_async_update(struct ahash_request *req) +{ + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + return crypto_ahash_update(mcryptd_req); +} + +static int sha256_mb_async_finup(struct ahash_request *req) +{ + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + return crypto_ahash_finup(mcryptd_req); +} + +static int sha256_mb_async_final(struct ahash_request *req) +{ + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + return crypto_ahash_final(mcryptd_req); +} + +static int sha256_mb_async_digest(struct ahash_request *req) +{ + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + return crypto_ahash_digest(mcryptd_req); +} + +static int sha256_mb_async_export(struct ahash_request *req, void *out) +{ + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + return crypto_ahash_export(mcryptd_req, out); +} + +static int sha256_mb_async_import(struct ahash_request *req, const void *in) +{ + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + struct crypto_ahash *child = mcryptd_ahash_child(mcryptd_tfm); + struct mcryptd_hash_request_ctx *rctx; + struct ahash_request *areq; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + rctx = ahash_request_ctx(mcryptd_req); + areq = &rctx->areq; + + ahash_request_set_tfm(areq, child); + ahash_request_set_callback(areq, CRYPTO_TFM_REQ_MAY_SLEEP, + rctx->complete, req); + + return crypto_ahash_import(mcryptd_req, in); +} + +static struct ahash_alg sha256_mb_async_alg = { + .init = sha256_mb_async_init, + .update = sha256_mb_async_update, + .final = sha256_mb_async_final, + .finup = sha256_mb_async_finup, + .export = sha256_mb_async_export, + .import = sha256_mb_async_import, + .digest = sha256_mb_async_digest, + .halg = { + .digestsize = SHA256_DIGEST_SIZE, + .statesize = sizeof(struct sha256_hash_ctx), + .base = { + .cra_name = "sha256", + .cra_driver_name = "sha256_mb", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_AHASH | + CRYPTO_ALG_ASYNC, + .cra_blocksize = SHA256_BLOCK_SIZE, + .cra_type = &crypto_ahash_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT + (sha256_mb_async_alg.halg.base.cra_list), + .cra_init = sha256_mb_async_init_tfm, + .cra_exit = sha256_mb_async_exit_tfm, + .cra_ctxsize = sizeof(struct sha256_mb_ctx), + .cra_alignmask = 0, + }, + }, +}; + +static unsigned long sha256_mb_flusher(struct mcryptd_alg_cstate *cstate) +{ + struct mcryptd_hash_request_ctx *rctx; + unsigned long cur_time; + unsigned long next_flush = 0; + struct sha256_hash_ctx *sha_ctx; + + + cur_time = jiffies; + + while (!list_empty(&cstate->work_list)) { + rctx = list_entry(cstate->work_list.next, + struct mcryptd_hash_request_ctx, waiter); + if (time_before(cur_time, rctx->tag.expire)) + break; + kernel_fpu_begin(); + sha_ctx = (struct sha256_hash_ctx *) + sha256_ctx_mgr_flush(cstate->mgr); + kernel_fpu_end(); + if (!sha_ctx) { + pr_err("sha256_mb error: nothing got" + " flushed for non-empty list\n"); + break; + } + rctx = cast_hash_to_mcryptd_ctx(sha_ctx); + sha_finish_walk(&rctx, cstate, true); + sha_complete_job(rctx, cstate, 0); + } + + if (!list_empty(&cstate->work_list)) { + rctx = list_entry(cstate->work_list.next, + struct mcryptd_hash_request_ctx, waiter); + /* get the hash context and then flush time */ + next_flush = rctx->tag.expire; + mcryptd_arm_flusher(cstate, get_delay(next_flush)); + } + return next_flush; +} + +static int __init sha256_mb_mod_init(void) +{ + + int cpu; + int err; + struct mcryptd_alg_cstate *cpu_state; + + /* check for dependent cpu features */ + if (!boot_cpu_has(X86_FEATURE_AVX2) || + !boot_cpu_has(X86_FEATURE_BMI2)) + return -ENODEV; + + /* initialize multibuffer structures */ + sha256_mb_alg_state.alg_cstate = alloc_percpu + (struct mcryptd_alg_cstate); + + sha256_job_mgr_init = sha256_mb_mgr_init_avx2; + sha256_job_mgr_submit = sha256_mb_mgr_submit_avx2; + sha256_job_mgr_flush = sha256_mb_mgr_flush_avx2; + sha256_job_mgr_get_comp_job = sha256_mb_mgr_get_comp_job_avx2; + + if (!sha256_mb_alg_state.alg_cstate) + return -ENOMEM; + for_each_possible_cpu(cpu) { + cpu_state = per_cpu_ptr(sha256_mb_alg_state.alg_cstate, cpu); + cpu_state->next_flush = 0; + cpu_state->next_seq_num = 0; + cpu_state->flusher_engaged = false; + INIT_DELAYED_WORK(&cpu_state->flush, mcryptd_flusher); + cpu_state->cpu = cpu; + cpu_state->alg_state = &sha256_mb_alg_state; + cpu_state->mgr = kzalloc(sizeof(struct sha256_ctx_mgr), + GFP_KERNEL); + if (!cpu_state->mgr) + goto err2; + sha256_ctx_mgr_init(cpu_state->mgr); + INIT_LIST_HEAD(&cpu_state->work_list); + spin_lock_init(&cpu_state->work_lock); + } + sha256_mb_alg_state.flusher = &sha256_mb_flusher; + + err = crypto_register_ahash(&sha256_mb_areq_alg); + if (err) + goto err2; + err = crypto_register_ahash(&sha256_mb_async_alg); + if (err) + goto err1; + + + return 0; +err1: + crypto_unregister_ahash(&sha256_mb_areq_alg); +err2: + for_each_possible_cpu(cpu) { + cpu_state = per_cpu_ptr(sha256_mb_alg_state.alg_cstate, cpu); + kfree(cpu_state->mgr); + } + free_percpu(sha256_mb_alg_state.alg_cstate); + return -ENODEV; +} + +static void __exit sha256_mb_mod_fini(void) +{ + int cpu; + struct mcryptd_alg_cstate *cpu_state; + + crypto_unregister_ahash(&sha256_mb_async_alg); + crypto_unregister_ahash(&sha256_mb_areq_alg); + for_each_possible_cpu(cpu) { + cpu_state = per_cpu_ptr(sha256_mb_alg_state.alg_cstate, cpu); + kfree(cpu_state->mgr); + } + free_percpu(sha256_mb_alg_state.alg_cstate); +} + +module_init(sha256_mb_mod_init); +module_exit(sha256_mb_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, multi buffer accelerated"); + +MODULE_ALIAS_CRYPTO("sha256"); diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h b/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h new file mode 100644 index 000000000000..edd252b73206 --- /dev/null +++ b/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h @@ -0,0 +1,136 @@ +/* + * Header file for multi buffer SHA256 context + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2016 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Megha Dey <megha.dey@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2016 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _SHA_MB_CTX_INTERNAL_H +#define _SHA_MB_CTX_INTERNAL_H + +#include "sha256_mb_mgr.h" + +#define HASH_UPDATE 0x00 +#define HASH_FIRST 0x01 +#define HASH_LAST 0x02 +#define HASH_ENTIRE 0x03 +#define HASH_DONE 0x04 +#define HASH_FINAL 0x08 + +#define HASH_CTX_STS_IDLE 0x00 +#define HASH_CTX_STS_PROCESSING 0x01 +#define HASH_CTX_STS_LAST 0x02 +#define HASH_CTX_STS_COMPLETE 0x04 + +enum hash_ctx_error { + HASH_CTX_ERROR_NONE = 0, + HASH_CTX_ERROR_INVALID_FLAGS = -1, + HASH_CTX_ERROR_ALREADY_PROCESSING = -2, + HASH_CTX_ERROR_ALREADY_COMPLETED = -3, + +#ifdef HASH_CTX_DEBUG + HASH_CTX_ERROR_DEBUG_DIGEST_MISMATCH = -4, +#endif +}; + + +#define hash_ctx_user_data(ctx) ((ctx)->user_data) +#define hash_ctx_digest(ctx) ((ctx)->job.result_digest) +#define hash_ctx_processing(ctx) ((ctx)->status & HASH_CTX_STS_PROCESSING) +#define hash_ctx_complete(ctx) ((ctx)->status == HASH_CTX_STS_COMPLETE) +#define hash_ctx_status(ctx) ((ctx)->status) +#define hash_ctx_error(ctx) ((ctx)->error) +#define hash_ctx_init(ctx) \ + do { \ + (ctx)->error = HASH_CTX_ERROR_NONE; \ + (ctx)->status = HASH_CTX_STS_COMPLETE; \ + } while (0) + + +/* Hash Constants and Typedefs */ +#define SHA256_DIGEST_LENGTH 8 +#define SHA256_LOG2_BLOCK_SIZE 6 + +#define SHA256_PADLENGTHFIELD_SIZE 8 + +#ifdef SHA_MB_DEBUG +#define assert(expr) \ +do { \ + if (unlikely(!(expr))) { \ + printk(KERN_ERR "Assertion failed! %s,%s,%s,line=%d\n", \ + #expr, __FILE__, __func__, __LINE__); \ + } \ +} while (0) +#else +#define assert(expr) do {} while (0) +#endif + +struct sha256_ctx_mgr { + struct sha256_mb_mgr mgr; +}; + +/* typedef struct sha256_ctx_mgr sha256_ctx_mgr; */ + +struct sha256_hash_ctx { + /* Must be at struct offset 0 */ + struct job_sha256 job; + /* status flag */ + int status; + /* error flag */ + int error; + + uint32_t total_length; + const void *incoming_buffer; + uint32_t incoming_buffer_length; + uint8_t partial_block_buffer[SHA256_BLOCK_SIZE * 2]; + uint32_t partial_block_buffer_length; + void *user_data; +}; + +#endif diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr.h b/arch/x86/crypto/sha256-mb/sha256_mb_mgr.h new file mode 100644 index 000000000000..b01ae408c56d --- /dev/null +++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr.h @@ -0,0 +1,108 @@ +/* + * Header file for multi buffer SHA256 algorithm manager + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2016 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Megha Dey <megha.dey@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2016 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef __SHA_MB_MGR_H +#define __SHA_MB_MGR_H + +#include <linux/types.h> + +#define NUM_SHA256_DIGEST_WORDS 8 + +enum job_sts { STS_UNKNOWN = 0, + STS_BEING_PROCESSED = 1, + STS_COMPLETED = 2, + STS_INTERNAL_ERROR = 3, + STS_ERROR = 4 +}; + +struct job_sha256 { + u8 *buffer; + u32 len; + u32 result_digest[NUM_SHA256_DIGEST_WORDS] __aligned(32); + enum job_sts status; + void *user_data; +}; + +/* SHA256 out-of-order scheduler */ + +/* typedef uint32_t sha8_digest_array[8][8]; */ + +struct sha256_args_x8 { + uint32_t digest[8][8]; + uint8_t *data_ptr[8]; +}; + +struct sha256_lane_data { + struct job_sha256 *job_in_lane; +}; + +struct sha256_mb_mgr { + struct sha256_args_x8 args; + + uint32_t lens[8]; + + /* each byte is index (0...7) of unused lanes */ + uint64_t unused_lanes; + /* byte 4 is set to FF as a flag */ + struct sha256_lane_data ldata[8]; +}; + + +#define SHA256_MB_MGR_NUM_LANES_AVX2 8 + +void sha256_mb_mgr_init_avx2(struct sha256_mb_mgr *state); +struct job_sha256 *sha256_mb_mgr_submit_avx2(struct sha256_mb_mgr *state, + struct job_sha256 *job); +struct job_sha256 *sha256_mb_mgr_flush_avx2(struct sha256_mb_mgr *state); +struct job_sha256 *sha256_mb_mgr_get_comp_job_avx2(struct sha256_mb_mgr *state); + +#endif diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_datastruct.S b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_datastruct.S new file mode 100644 index 000000000000..5c377bac21d0 --- /dev/null +++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_datastruct.S @@ -0,0 +1,304 @@ +/* + * Header file for multi buffer SHA256 algorithm data structure + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2016 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Megha Dey <megha.dey@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2016 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +# Macros for defining data structures + +# Usage example + +#START_FIELDS # JOB_AES +### name size align +#FIELD _plaintext, 8, 8 # pointer to plaintext +#FIELD _ciphertext, 8, 8 # pointer to ciphertext +#FIELD _IV, 16, 8 # IV +#FIELD _keys, 8, 8 # pointer to keys +#FIELD _len, 4, 4 # length in bytes +#FIELD _status, 4, 4 # status enumeration +#FIELD _user_data, 8, 8 # pointer to user data +#UNION _union, size1, align1, \ +# size2, align2, \ +# size3, align3, \ +# ... +#END_FIELDS +#%assign _JOB_AES_size _FIELD_OFFSET +#%assign _JOB_AES_align _STRUCT_ALIGN + +######################################################################### + +# Alternate "struc-like" syntax: +# STRUCT job_aes2 +# RES_Q .plaintext, 1 +# RES_Q .ciphertext, 1 +# RES_DQ .IV, 1 +# RES_B .nested, _JOB_AES_SIZE, _JOB_AES_ALIGN +# RES_U .union, size1, align1, \ +# size2, align2, \ +# ... +# ENDSTRUCT +# # Following only needed if nesting +# %assign job_aes2_size _FIELD_OFFSET +# %assign job_aes2_align _STRUCT_ALIGN +# +# RES_* macros take a name, a count and an optional alignment. +# The count in in terms of the base size of the macro, and the +# default alignment is the base size. +# The macros are: +# Macro Base size +# RES_B 1 +# RES_W 2 +# RES_D 4 +# RES_Q 8 +# RES_DQ 16 +# RES_Y 32 +# RES_Z 64 +# +# RES_U defines a union. It's arguments are a name and two or more +# pairs of "size, alignment" +# +# The two assigns are only needed if this structure is being nested +# within another. Even if the assigns are not done, one can still use +# STRUCT_NAME_size as the size of the structure. +# +# Note that for nesting, you still need to assign to STRUCT_NAME_size. +# +# The differences between this and using "struc" directly are that each +# type is implicitly aligned to its natural length (although this can be +# over-ridden with an explicit third parameter), and that the structure +# is padded at the end to its overall alignment. +# + +######################################################################### + +#ifndef _DATASTRUCT_ASM_ +#define _DATASTRUCT_ASM_ + +#define SZ8 8*SHA256_DIGEST_WORD_SIZE +#define ROUNDS 64*SZ8 +#define PTR_SZ 8 +#define SHA256_DIGEST_WORD_SIZE 4 +#define MAX_SHA256_LANES 8 +#define SHA256_DIGEST_WORDS 8 +#define SHA256_DIGEST_ROW_SIZE (MAX_SHA256_LANES * SHA256_DIGEST_WORD_SIZE) +#define SHA256_DIGEST_SIZE (SHA256_DIGEST_ROW_SIZE * SHA256_DIGEST_WORDS) +#define SHA256_BLK_SZ 64 + +# START_FIELDS +.macro START_FIELDS + _FIELD_OFFSET = 0 + _STRUCT_ALIGN = 0 +.endm + +# FIELD name size align +.macro FIELD name size align + _FIELD_OFFSET = (_FIELD_OFFSET + (\align) - 1) & (~ ((\align)-1)) + \name = _FIELD_OFFSET + _FIELD_OFFSET = _FIELD_OFFSET + (\size) +.if (\align > _STRUCT_ALIGN) + _STRUCT_ALIGN = \align +.endif +.endm + +# END_FIELDS +.macro END_FIELDS + _FIELD_OFFSET = (_FIELD_OFFSET + _STRUCT_ALIGN-1) & (~ (_STRUCT_ALIGN-1)) +.endm + +######################################################################## + +.macro STRUCT p1 +START_FIELDS +.struc \p1 +.endm + +.macro ENDSTRUCT + tmp = _FIELD_OFFSET + END_FIELDS + tmp = (_FIELD_OFFSET - %%tmp) +.if (tmp > 0) + .lcomm tmp +.endif +.endstruc +.endm + +## RES_int name size align +.macro RES_int p1 p2 p3 + name = \p1 + size = \p2 + align = .\p3 + + _FIELD_OFFSET = (_FIELD_OFFSET + (align) - 1) & (~ ((align)-1)) +.align align +.lcomm name size + _FIELD_OFFSET = _FIELD_OFFSET + (size) +.if (align > _STRUCT_ALIGN) + _STRUCT_ALIGN = align +.endif +.endm + +# macro RES_B name, size [, align] +.macro RES_B _name, _size, _align=1 +RES_int _name _size _align +.endm + +# macro RES_W name, size [, align] +.macro RES_W _name, _size, _align=2 +RES_int _name 2*(_size) _align +.endm + +# macro RES_D name, size [, align] +.macro RES_D _name, _size, _align=4 +RES_int _name 4*(_size) _align +.endm + +# macro RES_Q name, size [, align] +.macro RES_Q _name, _size, _align=8 +RES_int _name 8*(_size) _align +.endm + +# macro RES_DQ name, size [, align] +.macro RES_DQ _name, _size, _align=16 +RES_int _name 16*(_size) _align +.endm + +# macro RES_Y name, size [, align] +.macro RES_Y _name, _size, _align=32 +RES_int _name 32*(_size) _align +.endm + +# macro RES_Z name, size [, align] +.macro RES_Z _name, _size, _align=64 +RES_int _name 64*(_size) _align +.endm + +#endif + + +######################################################################## +#### Define SHA256 Out Of Order Data Structures +######################################################################## + +START_FIELDS # LANE_DATA +### name size align +FIELD _job_in_lane, 8, 8 # pointer to job object +END_FIELDS + + _LANE_DATA_size = _FIELD_OFFSET + _LANE_DATA_align = _STRUCT_ALIGN + +######################################################################## + +START_FIELDS # SHA256_ARGS_X4 +### name size align +FIELD _digest, 4*8*8, 4 # transposed digest +FIELD _data_ptr, 8*8, 8 # array of pointers to data +END_FIELDS + + _SHA256_ARGS_X4_size = _FIELD_OFFSET + _SHA256_ARGS_X4_align = _STRUCT_ALIGN + _SHA256_ARGS_X8_size = _FIELD_OFFSET + _SHA256_ARGS_X8_align = _STRUCT_ALIGN + +####################################################################### + +START_FIELDS # MB_MGR +### name size align +FIELD _args, _SHA256_ARGS_X4_size, _SHA256_ARGS_X4_align +FIELD _lens, 4*8, 8 +FIELD _unused_lanes, 8, 8 +FIELD _ldata, _LANE_DATA_size*8, _LANE_DATA_align +END_FIELDS + + _MB_MGR_size = _FIELD_OFFSET + _MB_MGR_align = _STRUCT_ALIGN + +_args_digest = _args + _digest +_args_data_ptr = _args + _data_ptr + +####################################################################### + +START_FIELDS #STACK_FRAME +### name size align +FIELD _data, 16*SZ8, 1 # transposed digest +FIELD _digest, 8*SZ8, 1 # array of pointers to data +FIELD _ytmp, 4*SZ8, 1 +FIELD _rsp, 8, 1 +END_FIELDS + + _STACK_FRAME_size = _FIELD_OFFSET + _STACK_FRAME_align = _STRUCT_ALIGN + +####################################################################### + +######################################################################## +#### Define constants +######################################################################## + +#define STS_UNKNOWN 0 +#define STS_BEING_PROCESSED 1 +#define STS_COMPLETED 2 + +######################################################################## +#### Define JOB_SHA256 structure +######################################################################## + +START_FIELDS # JOB_SHA256 + +### name size align +FIELD _buffer, 8, 8 # pointer to buffer +FIELD _len, 8, 8 # length in bytes +FIELD _result_digest, 8*4, 32 # Digest (output) +FIELD _status, 4, 4 +FIELD _user_data, 8, 8 +END_FIELDS + + _JOB_SHA256_size = _FIELD_OFFSET + _JOB_SHA256_align = _STRUCT_ALIGN diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S new file mode 100644 index 000000000000..b691da981cd9 --- /dev/null +++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S @@ -0,0 +1,304 @@ +/* + * Flush routine for SHA256 multibuffer + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2016 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Megha Dey <megha.dey@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2016 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include <linux/linkage.h> +#include <asm/frame.h> +#include "sha256_mb_mgr_datastruct.S" + +.extern sha256_x8_avx2 + +#LINUX register definitions +#define arg1 %rdi +#define arg2 %rsi + +# Common register definitions +#define state arg1 +#define job arg2 +#define len2 arg2 + +# idx must be a register not clobberred by sha1_mult +#define idx %r8 +#define DWORD_idx %r8d + +#define unused_lanes %rbx +#define lane_data %rbx +#define tmp2 %rbx +#define tmp2_w %ebx + +#define job_rax %rax +#define tmp1 %rax +#define size_offset %rax +#define tmp %rax +#define start_offset %rax + +#define tmp3 %arg1 + +#define extra_blocks %arg2 +#define p %arg2 + +.macro LABEL prefix n +\prefix\n\(): +.endm + +.macro JNE_SKIP i +jne skip_\i +.endm + +.altmacro +.macro SET_OFFSET _offset +offset = \_offset +.endm +.noaltmacro + +# JOB_SHA256* sha256_mb_mgr_flush_avx2(MB_MGR *state) +# arg 1 : rcx : state +ENTRY(sha256_mb_mgr_flush_avx2) + FRAME_BEGIN + push %rbx + + # If bit (32+3) is set, then all lanes are empty + mov _unused_lanes(state), unused_lanes + bt $32+3, unused_lanes + jc return_null + + # find a lane with a non-null job + xor idx, idx + offset = (_ldata + 1 * _LANE_DATA_size + _job_in_lane) + cmpq $0, offset(state) + cmovne one(%rip), idx + offset = (_ldata + 2 * _LANE_DATA_size + _job_in_lane) + cmpq $0, offset(state) + cmovne two(%rip), idx + offset = (_ldata + 3 * _LANE_DATA_size + _job_in_lane) + cmpq $0, offset(state) + cmovne three(%rip), idx + offset = (_ldata + 4 * _LANE_DATA_size + _job_in_lane) + cmpq $0, offset(state) + cmovne four(%rip), idx + offset = (_ldata + 5 * _LANE_DATA_size + _job_in_lane) + cmpq $0, offset(state) + cmovne five(%rip), idx + offset = (_ldata + 6 * _LANE_DATA_size + _job_in_lane) + cmpq $0, offset(state) + cmovne six(%rip), idx + offset = (_ldata + 7 * _LANE_DATA_size + _job_in_lane) + cmpq $0, offset(state) + cmovne seven(%rip), idx + + # copy idx to empty lanes +copy_lane_data: + offset = (_args + _data_ptr) + mov offset(state,idx,8), tmp + + I = 0 +.rep 8 + offset = (_ldata + I * _LANE_DATA_size + _job_in_lane) + cmpq $0, offset(state) +.altmacro + JNE_SKIP %I + offset = (_args + _data_ptr + 8*I) + mov tmp, offset(state) + offset = (_lens + 4*I) + movl $0xFFFFFFFF, offset(state) +LABEL skip_ %I + I = (I+1) +.noaltmacro +.endr + + # Find min length + vmovdqa _lens+0*16(state), %xmm0 + vmovdqa _lens+1*16(state), %xmm1 + + vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A} + vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C} + vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F} + vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E} + vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min val in low dword + + vmovd %xmm2, DWORD_idx + mov idx, len2 + and $0xF, idx + shr $4, len2 + jz len_is_0 + + vpand clear_low_nibble(%rip), %xmm2, %xmm2 + vpshufd $0, %xmm2, %xmm2 + + vpsubd %xmm2, %xmm0, %xmm0 + vpsubd %xmm2, %xmm1, %xmm1 + + vmovdqa %xmm0, _lens+0*16(state) + vmovdqa %xmm1, _lens+1*16(state) + + # "state" and "args" are the same address, arg1 + # len is arg2 + call sha256_x8_avx2 + # state and idx are intact + +len_is_0: + # process completed job "idx" + imul $_LANE_DATA_size, idx, lane_data + lea _ldata(state, lane_data), lane_data + + mov _job_in_lane(lane_data), job_rax + movq $0, _job_in_lane(lane_data) + movl $STS_COMPLETED, _status(job_rax) + mov _unused_lanes(state), unused_lanes + shl $4, unused_lanes + or idx, unused_lanes + + mov unused_lanes, _unused_lanes(state) + movl $0xFFFFFFFF, _lens(state,idx,4) + + vmovd _args_digest(state , idx, 4) , %xmm0 + vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0 + vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0 + vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0 + vmovd _args_digest+4*32(state, idx, 4), %xmm1 + vpinsrd $1, _args_digest+5*32(state, idx, 4), %xmm1, %xmm1 + vpinsrd $2, _args_digest+6*32(state, idx, 4), %xmm1, %xmm1 + vpinsrd $3, _args_digest+7*32(state, idx, 4), %xmm1, %xmm1 + + vmovdqu %xmm0, _result_digest(job_rax) + offset = (_result_digest + 1*16) + vmovdqu %xmm1, offset(job_rax) + +return: + pop %rbx + FRAME_END + ret + +return_null: + xor job_rax, job_rax + jmp return +ENDPROC(sha256_mb_mgr_flush_avx2) + +############################################################################## + +.align 16 +ENTRY(sha256_mb_mgr_get_comp_job_avx2) + push %rbx + + ## if bit 32+3 is set, then all lanes are empty + mov _unused_lanes(state), unused_lanes + bt $(32+3), unused_lanes + jc .return_null + + # Find min length + vmovdqa _lens(state), %xmm0 + vmovdqa _lens+1*16(state), %xmm1 + + vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A} + vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C} + vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F} + vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E} + vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min val in low dword + + vmovd %xmm2, DWORD_idx + test $~0xF, idx + jnz .return_null + + # process completed job "idx" + imul $_LANE_DATA_size, idx, lane_data + lea _ldata(state, lane_data), lane_data + + mov _job_in_lane(lane_data), job_rax + movq $0, _job_in_lane(lane_data) + movl $STS_COMPLETED, _status(job_rax) + mov _unused_lanes(state), unused_lanes + shl $4, unused_lanes + or idx, unused_lanes + mov unused_lanes, _unused_lanes(state) + + movl $0xFFFFFFFF, _lens(state, idx, 4) + + vmovd _args_digest(state, idx, 4), %xmm0 + vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0 + vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0 + vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0 + movl _args_digest+4*32(state, idx, 4), tmp2_w + vpinsrd $1, _args_digest+5*32(state, idx, 4), %xmm1, %xmm1 + vpinsrd $2, _args_digest+6*32(state, idx, 4), %xmm1, %xmm1 + vpinsrd $3, _args_digest+7*32(state, idx, 4), %xmm1, %xmm1 + + vmovdqu %xmm0, _result_digest(job_rax) + movl tmp2_w, _result_digest+1*16(job_rax) + + pop %rbx + + ret + +.return_null: + xor job_rax, job_rax + pop %rbx + ret +ENDPROC(sha256_mb_mgr_get_comp_job_avx2) + +.data + +.align 16 +clear_low_nibble: +.octa 0x000000000000000000000000FFFFFFF0 +one: +.quad 1 +two: +.quad 2 +three: +.quad 3 +four: +.quad 4 +five: +.quad 5 +six: +.quad 6 +seven: +.quad 7 diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_init_avx2.c b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_init_avx2.c new file mode 100644 index 000000000000..b0c498371e67 --- /dev/null +++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_init_avx2.c @@ -0,0 +1,65 @@ +/* + * Initialization code for multi buffer SHA256 algorithm for AVX2 + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2016 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Megha Dey <megha.dey@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2016 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "sha256_mb_mgr.h" + +void sha256_mb_mgr_init_avx2(struct sha256_mb_mgr *state) +{ + unsigned int j; + + state->unused_lanes = 0xF76543210ULL; + for (j = 0; j < 8; j++) { + state->lens[j] = 0xFFFFFFFF; + state->ldata[j].job_in_lane = NULL; + } +} diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S new file mode 100644 index 000000000000..7ea670e25acc --- /dev/null +++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S @@ -0,0 +1,215 @@ +/* + * Buffer submit code for multi buffer SHA256 algorithm + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2016 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Megha Dey <megha.dey@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2016 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <linux/linkage.h> +#include <asm/frame.h> +#include "sha256_mb_mgr_datastruct.S" + +.extern sha256_x8_avx2 + +# LINUX register definitions +arg1 = %rdi +arg2 = %rsi +size_offset = %rcx +tmp2 = %rcx +extra_blocks = %rdx + +# Common definitions +#define state arg1 +#define job %rsi +#define len2 arg2 +#define p2 arg2 + +# idx must be a register not clobberred by sha1_x8_avx2 +idx = %r8 +DWORD_idx = %r8d +last_len = %r8 + +p = %r11 +start_offset = %r11 + +unused_lanes = %rbx +BYTE_unused_lanes = %bl + +job_rax = %rax +len = %rax +DWORD_len = %eax + +lane = %r12 +tmp3 = %r12 + +tmp = %r9 +DWORD_tmp = %r9d + +lane_data = %r10 + +# JOB* sha256_mb_mgr_submit_avx2(MB_MGR *state, JOB_SHA256 *job) +# arg 1 : rcx : state +# arg 2 : rdx : job +ENTRY(sha256_mb_mgr_submit_avx2) + FRAME_BEGIN + push %rbx + push %r12 + + mov _unused_lanes(state), unused_lanes + mov unused_lanes, lane + and $0xF, lane + shr $4, unused_lanes + imul $_LANE_DATA_size, lane, lane_data + movl $STS_BEING_PROCESSED, _status(job) + lea _ldata(state, lane_data), lane_data + mov unused_lanes, _unused_lanes(state) + movl _len(job), DWORD_len + + mov job, _job_in_lane(lane_data) + shl $4, len + or lane, len + + movl DWORD_len, _lens(state , lane, 4) + + # Load digest words from result_digest + vmovdqu _result_digest(job), %xmm0 + vmovdqu _result_digest+1*16(job), %xmm1 + vmovd %xmm0, _args_digest(state, lane, 4) + vpextrd $1, %xmm0, _args_digest+1*32(state , lane, 4) + vpextrd $2, %xmm0, _args_digest+2*32(state , lane, 4) + vpextrd $3, %xmm0, _args_digest+3*32(state , lane, 4) + vmovd %xmm1, _args_digest+4*32(state , lane, 4) + + vpextrd $1, %xmm1, _args_digest+5*32(state , lane, 4) + vpextrd $2, %xmm1, _args_digest+6*32(state , lane, 4) + vpextrd $3, %xmm1, _args_digest+7*32(state , lane, 4) + + mov _buffer(job), p + mov p, _args_data_ptr(state, lane, 8) + + cmp $0xF, unused_lanes + jne return_null + +start_loop: + # Find min length + vmovdqa _lens(state), %xmm0 + vmovdqa _lens+1*16(state), %xmm1 + + vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A} + vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C} + vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F} + vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E} + vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min val in low dword + + vmovd %xmm2, DWORD_idx + mov idx, len2 + and $0xF, idx + shr $4, len2 + jz len_is_0 + + vpand clear_low_nibble(%rip), %xmm2, %xmm2 + vpshufd $0, %xmm2, %xmm2 + + vpsubd %xmm2, %xmm0, %xmm0 + vpsubd %xmm2, %xmm1, %xmm1 + + vmovdqa %xmm0, _lens + 0*16(state) + vmovdqa %xmm1, _lens + 1*16(state) + + # "state" and "args" are the same address, arg1 + # len is arg2 + call sha256_x8_avx2 + + # state and idx are intact + +len_is_0: + # process completed job "idx" + imul $_LANE_DATA_size, idx, lane_data + lea _ldata(state, lane_data), lane_data + + mov _job_in_lane(lane_data), job_rax + mov _unused_lanes(state), unused_lanes + movq $0, _job_in_lane(lane_data) + movl $STS_COMPLETED, _status(job_rax) + shl $4, unused_lanes + or idx, unused_lanes + mov unused_lanes, _unused_lanes(state) + + movl $0xFFFFFFFF, _lens(state,idx,4) + + vmovd _args_digest(state, idx, 4), %xmm0 + vpinsrd $1, _args_digest+1*32(state , idx, 4), %xmm0, %xmm0 + vpinsrd $2, _args_digest+2*32(state , idx, 4), %xmm0, %xmm0 + vpinsrd $3, _args_digest+3*32(state , idx, 4), %xmm0, %xmm0 + vmovd _args_digest+4*32(state, idx, 4), %xmm1 + + vpinsrd $1, _args_digest+5*32(state , idx, 4), %xmm1, %xmm1 + vpinsrd $2, _args_digest+6*32(state , idx, 4), %xmm1, %xmm1 + vpinsrd $3, _args_digest+7*32(state , idx, 4), %xmm1, %xmm1 + + vmovdqu %xmm0, _result_digest(job_rax) + vmovdqu %xmm1, _result_digest+1*16(job_rax) + +return: + pop %r12 + pop %rbx + FRAME_END + ret + +return_null: + xor job_rax, job_rax + jmp return + +ENDPROC(sha256_mb_mgr_submit_avx2) + +.data + +.align 16 +clear_low_nibble: + .octa 0x000000000000000000000000FFFFFFF0 diff --git a/arch/x86/crypto/sha256-mb/sha256_x8_avx2.S b/arch/x86/crypto/sha256-mb/sha256_x8_avx2.S new file mode 100644 index 000000000000..aa21aea4c722 --- /dev/null +++ b/arch/x86/crypto/sha256-mb/sha256_x8_avx2.S @@ -0,0 +1,593 @@ +/* + * Multi-buffer SHA256 algorithm hash compute routine + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2016 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Megha Dey <megha.dey@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2016 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <linux/linkage.h> +#include "sha256_mb_mgr_datastruct.S" + +## code to compute oct SHA256 using SSE-256 +## outer calling routine takes care of save and restore of XMM registers +## Logic designed/laid out by JDG + +## Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; %ymm0-15 +## Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15 +## Linux preserves: rdi rbp r8 +## +## clobbers %ymm0-15 + +arg1 = %rdi +arg2 = %rsi +reg3 = %rcx +reg4 = %rdx + +# Common definitions +STATE = arg1 +INP_SIZE = arg2 + +IDX = %rax +ROUND = %rbx +TBL = reg3 + +inp0 = %r9 +inp1 = %r10 +inp2 = %r11 +inp3 = %r12 +inp4 = %r13 +inp5 = %r14 +inp6 = %r15 +inp7 = reg4 + +a = %ymm0 +b = %ymm1 +c = %ymm2 +d = %ymm3 +e = %ymm4 +f = %ymm5 +g = %ymm6 +h = %ymm7 + +T1 = %ymm8 + +a0 = %ymm12 +a1 = %ymm13 +a2 = %ymm14 +TMP = %ymm15 +TMP0 = %ymm6 +TMP1 = %ymm7 + +TT0 = %ymm8 +TT1 = %ymm9 +TT2 = %ymm10 +TT3 = %ymm11 +TT4 = %ymm12 +TT5 = %ymm13 +TT6 = %ymm14 +TT7 = %ymm15 + +# Define stack usage + +# Assume stack aligned to 32 bytes before call +# Therefore FRAMESZ mod 32 must be 32-8 = 24 + +#define FRAMESZ 0x388 + +#define VMOVPS vmovups + +# TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1 +# "transpose" data in {r0...r7} using temps {t0...t1} +# Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7} +# r0 = {a7 a6 a5 a4 a3 a2 a1 a0} +# r1 = {b7 b6 b5 b4 b3 b2 b1 b0} +# r2 = {c7 c6 c5 c4 c3 c2 c1 c0} +# r3 = {d7 d6 d5 d4 d3 d2 d1 d0} +# r4 = {e7 e6 e5 e4 e3 e2 e1 e0} +# r5 = {f7 f6 f5 f4 f3 f2 f1 f0} +# r6 = {g7 g6 g5 g4 g3 g2 g1 g0} +# r7 = {h7 h6 h5 h4 h3 h2 h1 h0} +# +# Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7} +# r0 = {h0 g0 f0 e0 d0 c0 b0 a0} +# r1 = {h1 g1 f1 e1 d1 c1 b1 a1} +# r2 = {h2 g2 f2 e2 d2 c2 b2 a2} +# r3 = {h3 g3 f3 e3 d3 c3 b3 a3} +# r4 = {h4 g4 f4 e4 d4 c4 b4 a4} +# r5 = {h5 g5 f5 e5 d5 c5 b5 a5} +# r6 = {h6 g6 f6 e6 d6 c6 b6 a6} +# r7 = {h7 g7 f7 e7 d7 c7 b7 a7} +# + +.macro TRANSPOSE8 r0 r1 r2 r3 r4 r5 r6 r7 t0 t1 + # process top half (r0..r3) {a...d} + vshufps $0x44, \r1, \r0, \t0 # t0 = {b5 b4 a5 a4 b1 b0 a1 a0} + vshufps $0xEE, \r1, \r0, \r0 # r0 = {b7 b6 a7 a6 b3 b2 a3 a2} + vshufps $0x44, \r3, \r2, \t1 # t1 = {d5 d4 c5 c4 d1 d0 c1 c0} + vshufps $0xEE, \r3, \r2, \r2 # r2 = {d7 d6 c7 c6 d3 d2 c3 c2} + vshufps $0xDD, \t1, \t0, \r3 # r3 = {d5 c5 b5 a5 d1 c1 b1 a1} + vshufps $0x88, \r2, \r0, \r1 # r1 = {d6 c6 b6 a6 d2 c2 b2 a2} + vshufps $0xDD, \r2, \r0, \r0 # r0 = {d7 c7 b7 a7 d3 c3 b3 a3} + vshufps $0x88, \t1, \t0, \t0 # t0 = {d4 c4 b4 a4 d0 c0 b0 a0} + + # use r2 in place of t0 + # process bottom half (r4..r7) {e...h} + vshufps $0x44, \r5, \r4, \r2 # r2 = {f5 f4 e5 e4 f1 f0 e1 e0} + vshufps $0xEE, \r5, \r4, \r4 # r4 = {f7 f6 e7 e6 f3 f2 e3 e2} + vshufps $0x44, \r7, \r6, \t1 # t1 = {h5 h4 g5 g4 h1 h0 g1 g0} + vshufps $0xEE, \r7, \r6, \r6 # r6 = {h7 h6 g7 g6 h3 h2 g3 g2} + vshufps $0xDD, \t1, \r2, \r7 # r7 = {h5 g5 f5 e5 h1 g1 f1 e1} + vshufps $0x88, \r6, \r4, \r5 # r5 = {h6 g6 f6 e6 h2 g2 f2 e2} + vshufps $0xDD, \r6, \r4, \r4 # r4 = {h7 g7 f7 e7 h3 g3 f3 e3} + vshufps $0x88, \t1, \r2, \t1 # t1 = {h4 g4 f4 e4 h0 g0 f0 e0} + + vperm2f128 $0x13, \r1, \r5, \r6 # h6...a6 + vperm2f128 $0x02, \r1, \r5, \r2 # h2...a2 + vperm2f128 $0x13, \r3, \r7, \r5 # h5...a5 + vperm2f128 $0x02, \r3, \r7, \r1 # h1...a1 + vperm2f128 $0x13, \r0, \r4, \r7 # h7...a7 + vperm2f128 $0x02, \r0, \r4, \r3 # h3...a3 + vperm2f128 $0x13, \t0, \t1, \r4 # h4...a4 + vperm2f128 $0x02, \t0, \t1, \r0 # h0...a0 + +.endm + +.macro ROTATE_ARGS +TMP_ = h +h = g +g = f +f = e +e = d +d = c +c = b +b = a +a = TMP_ +.endm + +.macro _PRORD reg imm tmp + vpslld $(32-\imm),\reg,\tmp + vpsrld $\imm,\reg, \reg + vpor \tmp,\reg, \reg +.endm + +# PRORD_nd reg, imm, tmp, src +.macro _PRORD_nd reg imm tmp src + vpslld $(32-\imm), \src, \tmp + vpsrld $\imm, \src, \reg + vpor \tmp, \reg, \reg +.endm + +# PRORD dst/src, amt +.macro PRORD reg imm + _PRORD \reg,\imm,TMP +.endm + +# PRORD_nd dst, src, amt +.macro PRORD_nd reg tmp imm + _PRORD_nd \reg, \imm, TMP, \tmp +.endm + +# arguments passed implicitly in preprocessor symbols i, a...h +.macro ROUND_00_15 _T1 i + PRORD_nd a0,e,5 # sig1: a0 = (e >> 5) + + vpxor g, f, a2 # ch: a2 = f^g + vpand e,a2, a2 # ch: a2 = (f^g)&e + vpxor g, a2, a2 # a2 = ch + + PRORD_nd a1,e,25 # sig1: a1 = (e >> 25) + + vmovdqu \_T1,(SZ8*(\i & 0xf))(%rsp) + vpaddd (TBL,ROUND,1), \_T1, \_T1 # T1 = W + K + vpxor e,a0, a0 # sig1: a0 = e ^ (e >> 5) + PRORD a0, 6 # sig1: a0 = (e >> 6) ^ (e >> 11) + vpaddd a2, h, h # h = h + ch + PRORD_nd a2,a,11 # sig0: a2 = (a >> 11) + vpaddd \_T1,h, h # h = h + ch + W + K + vpxor a1, a0, a0 # a0 = sigma1 + PRORD_nd a1,a,22 # sig0: a1 = (a >> 22) + vpxor c, a, \_T1 # maj: T1 = a^c + add $SZ8, ROUND # ROUND++ + vpand b, \_T1, \_T1 # maj: T1 = (a^c)&b + vpaddd a0, h, h + vpaddd h, d, d + vpxor a, a2, a2 # sig0: a2 = a ^ (a >> 11) + PRORD a2,2 # sig0: a2 = (a >> 2) ^ (a >> 13) + vpxor a1, a2, a2 # a2 = sig0 + vpand c, a, a1 # maj: a1 = a&c + vpor \_T1, a1, a1 # a1 = maj + vpaddd a1, h, h # h = h + ch + W + K + maj + vpaddd a2, h, h # h = h + ch + W + K + maj + sigma0 + ROTATE_ARGS +.endm + +# arguments passed implicitly in preprocessor symbols i, a...h +.macro ROUND_16_XX _T1 i + vmovdqu (SZ8*((\i-15)&0xf))(%rsp), \_T1 + vmovdqu (SZ8*((\i-2)&0xf))(%rsp), a1 + vmovdqu \_T1, a0 + PRORD \_T1,11 + vmovdqu a1, a2 + PRORD a1,2 + vpxor a0, \_T1, \_T1 + PRORD \_T1, 7 + vpxor a2, a1, a1 + PRORD a1, 17 + vpsrld $3, a0, a0 + vpxor a0, \_T1, \_T1 + vpsrld $10, a2, a2 + vpxor a2, a1, a1 + vpaddd (SZ8*((\i-16)&0xf))(%rsp), \_T1, \_T1 + vpaddd (SZ8*((\i-7)&0xf))(%rsp), a1, a1 + vpaddd a1, \_T1, \_T1 + + ROUND_00_15 \_T1,\i +.endm + +# SHA256_ARGS: +# UINT128 digest[8]; // transposed digests +# UINT8 *data_ptr[4]; + +# void sha256_x8_avx2(SHA256_ARGS *args, UINT64 bytes); +# arg 1 : STATE : pointer to array of pointers to input data +# arg 2 : INP_SIZE : size of input in blocks + # general registers preserved in outer calling routine + # outer calling routine saves all the XMM registers + # save rsp, allocate 32-byte aligned for local variables +ENTRY(sha256_x8_avx2) + + # save callee-saved clobbered registers to comply with C function ABI + push %r12 + push %r13 + push %r14 + push %r15 + + mov %rsp, IDX + sub $FRAMESZ, %rsp + and $~0x1F, %rsp + mov IDX, _rsp(%rsp) + + # Load the pre-transposed incoming digest. + vmovdqu 0*SHA256_DIGEST_ROW_SIZE(STATE),a + vmovdqu 1*SHA256_DIGEST_ROW_SIZE(STATE),b + vmovdqu 2*SHA256_DIGEST_ROW_SIZE(STATE),c + vmovdqu 3*SHA256_DIGEST_ROW_SIZE(STATE),d + vmovdqu 4*SHA256_DIGEST_ROW_SIZE(STATE),e + vmovdqu 5*SHA256_DIGEST_ROW_SIZE(STATE),f + vmovdqu 6*SHA256_DIGEST_ROW_SIZE(STATE),g + vmovdqu 7*SHA256_DIGEST_ROW_SIZE(STATE),h + + lea K256_8(%rip),TBL + + # load the address of each of the 4 message lanes + # getting ready to transpose input onto stack + mov _args_data_ptr+0*PTR_SZ(STATE),inp0 + mov _args_data_ptr+1*PTR_SZ(STATE),inp1 + mov _args_data_ptr+2*PTR_SZ(STATE),inp2 + mov _args_data_ptr+3*PTR_SZ(STATE),inp3 + mov _args_data_ptr+4*PTR_SZ(STATE),inp4 + mov _args_data_ptr+5*PTR_SZ(STATE),inp5 + mov _args_data_ptr+6*PTR_SZ(STATE),inp6 + mov _args_data_ptr+7*PTR_SZ(STATE),inp7 + + xor IDX, IDX +lloop: + xor ROUND, ROUND + + # save old digest + vmovdqu a, _digest(%rsp) + vmovdqu b, _digest+1*SZ8(%rsp) + vmovdqu c, _digest+2*SZ8(%rsp) + vmovdqu d, _digest+3*SZ8(%rsp) + vmovdqu e, _digest+4*SZ8(%rsp) + vmovdqu f, _digest+5*SZ8(%rsp) + vmovdqu g, _digest+6*SZ8(%rsp) + vmovdqu h, _digest+7*SZ8(%rsp) + i = 0 +.rep 2 + VMOVPS i*32(inp0, IDX), TT0 + VMOVPS i*32(inp1, IDX), TT1 + VMOVPS i*32(inp2, IDX), TT2 + VMOVPS i*32(inp3, IDX), TT3 + VMOVPS i*32(inp4, IDX), TT4 + VMOVPS i*32(inp5, IDX), TT5 + VMOVPS i*32(inp6, IDX), TT6 + VMOVPS i*32(inp7, IDX), TT7 + vmovdqu g, _ytmp(%rsp) + vmovdqu h, _ytmp+1*SZ8(%rsp) + TRANSPOSE8 TT0, TT1, TT2, TT3, TT4, TT5, TT6, TT7, TMP0, TMP1 + vmovdqu PSHUFFLE_BYTE_FLIP_MASK(%rip), TMP1 + vmovdqu _ytmp(%rsp), g + vpshufb TMP1, TT0, TT0 + vpshufb TMP1, TT1, TT1 + vpshufb TMP1, TT2, TT2 + vpshufb TMP1, TT3, TT3 + vpshufb TMP1, TT4, TT4 + vpshufb TMP1, TT5, TT5 + vpshufb TMP1, TT6, TT6 + vpshufb TMP1, TT7, TT7 + vmovdqu _ytmp+1*SZ8(%rsp), h + vmovdqu TT4, _ytmp(%rsp) + vmovdqu TT5, _ytmp+1*SZ8(%rsp) + vmovdqu TT6, _ytmp+2*SZ8(%rsp) + vmovdqu TT7, _ytmp+3*SZ8(%rsp) + ROUND_00_15 TT0,(i*8+0) + vmovdqu _ytmp(%rsp), TT0 + ROUND_00_15 TT1,(i*8+1) + vmovdqu _ytmp+1*SZ8(%rsp), TT1 + ROUND_00_15 TT2,(i*8+2) + vmovdqu _ytmp+2*SZ8(%rsp), TT2 + ROUND_00_15 TT3,(i*8+3) + vmovdqu _ytmp+3*SZ8(%rsp), TT3 + ROUND_00_15 TT0,(i*8+4) + ROUND_00_15 TT1,(i*8+5) + ROUND_00_15 TT2,(i*8+6) + ROUND_00_15 TT3,(i*8+7) + i = (i+1) +.endr + add $64, IDX + i = (i*8) + + jmp Lrounds_16_xx +.align 16 +Lrounds_16_xx: +.rep 16 + ROUND_16_XX T1, i + i = (i+1) +.endr + + cmp $ROUNDS,ROUND + jb Lrounds_16_xx + + # add old digest + vpaddd _digest+0*SZ8(%rsp), a, a + vpaddd _digest+1*SZ8(%rsp), b, b + vpaddd _digest+2*SZ8(%rsp), c, c + vpaddd _digest+3*SZ8(%rsp), d, d + vpaddd _digest+4*SZ8(%rsp), e, e + vpaddd _digest+5*SZ8(%rsp), f, f + vpaddd _digest+6*SZ8(%rsp), g, g + vpaddd _digest+7*SZ8(%rsp), h, h + + sub $1, INP_SIZE # unit is blocks + jne lloop + + # write back to memory (state object) the transposed digest + vmovdqu a, 0*SHA256_DIGEST_ROW_SIZE(STATE) + vmovdqu b, 1*SHA256_DIGEST_ROW_SIZE(STATE) + vmovdqu c, 2*SHA256_DIGEST_ROW_SIZE(STATE) + vmovdqu d, 3*SHA256_DIGEST_ROW_SIZE(STATE) + vmovdqu e, 4*SHA256_DIGEST_ROW_SIZE(STATE) + vmovdqu f, 5*SHA256_DIGEST_ROW_SIZE(STATE) + vmovdqu g, 6*SHA256_DIGEST_ROW_SIZE(STATE) + vmovdqu h, 7*SHA256_DIGEST_ROW_SIZE(STATE) + + # update input pointers + add IDX, inp0 + mov inp0, _args_data_ptr+0*8(STATE) + add IDX, inp1 + mov inp1, _args_data_ptr+1*8(STATE) + add IDX, inp2 + mov inp2, _args_data_ptr+2*8(STATE) + add IDX, inp3 + mov inp3, _args_data_ptr+3*8(STATE) + add IDX, inp4 + mov inp4, _args_data_ptr+4*8(STATE) + add IDX, inp5 + mov inp5, _args_data_ptr+5*8(STATE) + add IDX, inp6 + mov inp6, _args_data_ptr+6*8(STATE) + add IDX, inp7 + mov inp7, _args_data_ptr+7*8(STATE) + + # Postamble + mov _rsp(%rsp), %rsp + + # restore callee-saved clobbered registers + pop %r15 + pop %r14 + pop %r13 + pop %r12 + + ret +ENDPROC(sha256_x8_avx2) +.data +.align 64 +K256_8: + .octa 0x428a2f98428a2f98428a2f98428a2f98 + .octa 0x428a2f98428a2f98428a2f98428a2f98 + .octa 0x71374491713744917137449171374491 + .octa 0x71374491713744917137449171374491 + .octa 0xb5c0fbcfb5c0fbcfb5c0fbcfb5c0fbcf + .octa 0xb5c0fbcfb5c0fbcfb5c0fbcfb5c0fbcf + .octa 0xe9b5dba5e9b5dba5e9b5dba5e9b5dba5 + .octa 0xe9b5dba5e9b5dba5e9b5dba5e9b5dba5 + .octa 0x3956c25b3956c25b3956c25b3956c25b + .octa 0x3956c25b3956c25b3956c25b3956c25b + .octa 0x59f111f159f111f159f111f159f111f1 + .octa 0x59f111f159f111f159f111f159f111f1 + .octa 0x923f82a4923f82a4923f82a4923f82a4 + .octa 0x923f82a4923f82a4923f82a4923f82a4 + .octa 0xab1c5ed5ab1c5ed5ab1c5ed5ab1c5ed5 + .octa 0xab1c5ed5ab1c5ed5ab1c5ed5ab1c5ed5 + .octa 0xd807aa98d807aa98d807aa98d807aa98 + .octa 0xd807aa98d807aa98d807aa98d807aa98 + .octa 0x12835b0112835b0112835b0112835b01 + .octa 0x12835b0112835b0112835b0112835b01 + .octa 0x243185be243185be243185be243185be + .octa 0x243185be243185be243185be243185be + .octa 0x550c7dc3550c7dc3550c7dc3550c7dc3 + .octa 0x550c7dc3550c7dc3550c7dc3550c7dc3 + .octa 0x72be5d7472be5d7472be5d7472be5d74 + .octa 0x72be5d7472be5d7472be5d7472be5d74 + .octa 0x80deb1fe80deb1fe80deb1fe80deb1fe + .octa 0x80deb1fe80deb1fe80deb1fe80deb1fe + .octa 0x9bdc06a79bdc06a79bdc06a79bdc06a7 + .octa 0x9bdc06a79bdc06a79bdc06a79bdc06a7 + .octa 0xc19bf174c19bf174c19bf174c19bf174 + .octa 0xc19bf174c19bf174c19bf174c19bf174 + .octa 0xe49b69c1e49b69c1e49b69c1e49b69c1 + .octa 0xe49b69c1e49b69c1e49b69c1e49b69c1 + .octa 0xefbe4786efbe4786efbe4786efbe4786 + .octa 0xefbe4786efbe4786efbe4786efbe4786 + .octa 0x0fc19dc60fc19dc60fc19dc60fc19dc6 + .octa 0x0fc19dc60fc19dc60fc19dc60fc19dc6 + .octa 0x240ca1cc240ca1cc240ca1cc240ca1cc + .octa 0x240ca1cc240ca1cc240ca1cc240ca1cc + .octa 0x2de92c6f2de92c6f2de92c6f2de92c6f + .octa 0x2de92c6f2de92c6f2de92c6f2de92c6f + .octa 0x4a7484aa4a7484aa4a7484aa4a7484aa + .octa 0x4a7484aa4a7484aa4a7484aa4a7484aa + .octa 0x5cb0a9dc5cb0a9dc5cb0a9dc5cb0a9dc + .octa 0x5cb0a9dc5cb0a9dc5cb0a9dc5cb0a9dc + .octa 0x76f988da76f988da76f988da76f988da + .octa 0x76f988da76f988da76f988da76f988da + .octa 0x983e5152983e5152983e5152983e5152 + .octa 0x983e5152983e5152983e5152983e5152 + .octa 0xa831c66da831c66da831c66da831c66d + .octa 0xa831c66da831c66da831c66da831c66d + .octa 0xb00327c8b00327c8b00327c8b00327c8 + .octa 0xb00327c8b00327c8b00327c8b00327c8 + .octa 0xbf597fc7bf597fc7bf597fc7bf597fc7 + .octa 0xbf597fc7bf597fc7bf597fc7bf597fc7 + .octa 0xc6e00bf3c6e00bf3c6e00bf3c6e00bf3 + .octa 0xc6e00bf3c6e00bf3c6e00bf3c6e00bf3 + .octa 0xd5a79147d5a79147d5a79147d5a79147 + .octa 0xd5a79147d5a79147d5a79147d5a79147 + .octa 0x06ca635106ca635106ca635106ca6351 + .octa 0x06ca635106ca635106ca635106ca6351 + .octa 0x14292967142929671429296714292967 + .octa 0x14292967142929671429296714292967 + .octa 0x27b70a8527b70a8527b70a8527b70a85 + .octa 0x27b70a8527b70a8527b70a8527b70a85 + .octa 0x2e1b21382e1b21382e1b21382e1b2138 + .octa 0x2e1b21382e1b21382e1b21382e1b2138 + .octa 0x4d2c6dfc4d2c6dfc4d2c6dfc4d2c6dfc + .octa 0x4d2c6dfc4d2c6dfc4d2c6dfc4d2c6dfc + .octa 0x53380d1353380d1353380d1353380d13 + .octa 0x53380d1353380d1353380d1353380d13 + .octa 0x650a7354650a7354650a7354650a7354 + .octa 0x650a7354650a7354650a7354650a7354 + .octa 0x766a0abb766a0abb766a0abb766a0abb + .octa 0x766a0abb766a0abb766a0abb766a0abb + .octa 0x81c2c92e81c2c92e81c2c92e81c2c92e + .octa 0x81c2c92e81c2c92e81c2c92e81c2c92e + .octa 0x92722c8592722c8592722c8592722c85 + .octa 0x92722c8592722c8592722c8592722c85 + .octa 0xa2bfe8a1a2bfe8a1a2bfe8a1a2bfe8a1 + .octa 0xa2bfe8a1a2bfe8a1a2bfe8a1a2bfe8a1 + .octa 0xa81a664ba81a664ba81a664ba81a664b + .octa 0xa81a664ba81a664ba81a664ba81a664b + .octa 0xc24b8b70c24b8b70c24b8b70c24b8b70 + .octa 0xc24b8b70c24b8b70c24b8b70c24b8b70 + .octa 0xc76c51a3c76c51a3c76c51a3c76c51a3 + .octa 0xc76c51a3c76c51a3c76c51a3c76c51a3 + .octa 0xd192e819d192e819d192e819d192e819 + .octa 0xd192e819d192e819d192e819d192e819 + .octa 0xd6990624d6990624d6990624d6990624 + .octa 0xd6990624d6990624d6990624d6990624 + .octa 0xf40e3585f40e3585f40e3585f40e3585 + .octa 0xf40e3585f40e3585f40e3585f40e3585 + .octa 0x106aa070106aa070106aa070106aa070 + .octa 0x106aa070106aa070106aa070106aa070 + .octa 0x19a4c11619a4c11619a4c11619a4c116 + .octa 0x19a4c11619a4c11619a4c11619a4c116 + .octa 0x1e376c081e376c081e376c081e376c08 + .octa 0x1e376c081e376c081e376c081e376c08 + .octa 0x2748774c2748774c2748774c2748774c + .octa 0x2748774c2748774c2748774c2748774c + .octa 0x34b0bcb534b0bcb534b0bcb534b0bcb5 + .octa 0x34b0bcb534b0bcb534b0bcb534b0bcb5 + .octa 0x391c0cb3391c0cb3391c0cb3391c0cb3 + .octa 0x391c0cb3391c0cb3391c0cb3391c0cb3 + .octa 0x4ed8aa4a4ed8aa4a4ed8aa4a4ed8aa4a + .octa 0x4ed8aa4a4ed8aa4a4ed8aa4a4ed8aa4a + .octa 0x5b9cca4f5b9cca4f5b9cca4f5b9cca4f + .octa 0x5b9cca4f5b9cca4f5b9cca4f5b9cca4f + .octa 0x682e6ff3682e6ff3682e6ff3682e6ff3 + .octa 0x682e6ff3682e6ff3682e6ff3682e6ff3 + .octa 0x748f82ee748f82ee748f82ee748f82ee + .octa 0x748f82ee748f82ee748f82ee748f82ee + .octa 0x78a5636f78a5636f78a5636f78a5636f + .octa 0x78a5636f78a5636f78a5636f78a5636f + .octa 0x84c8781484c8781484c8781484c87814 + .octa 0x84c8781484c8781484c8781484c87814 + .octa 0x8cc702088cc702088cc702088cc70208 + .octa 0x8cc702088cc702088cc702088cc70208 + .octa 0x90befffa90befffa90befffa90befffa + .octa 0x90befffa90befffa90befffa90befffa + .octa 0xa4506ceba4506ceba4506ceba4506ceb + .octa 0xa4506ceba4506ceba4506ceba4506ceb + .octa 0xbef9a3f7bef9a3f7bef9a3f7bef9a3f7 + .octa 0xbef9a3f7bef9a3f7bef9a3f7bef9a3f7 + .octa 0xc67178f2c67178f2c67178f2c67178f2 + .octa 0xc67178f2c67178f2c67178f2c67178f2 +PSHUFFLE_BYTE_FLIP_MASK: +.octa 0x0c0d0e0f08090a0b0405060700010203 +.octa 0x0c0d0e0f08090a0b0405060700010203 + +.align 64 +.global K256 +K256: + .int 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .int 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .int 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .int 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .int 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .int 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .int 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .int 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .int 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .int 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .int 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .int 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .int 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .int 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .int 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .int 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index 3ae0f43ebd37..9e79baf03a4b 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c @@ -427,4 +427,14 @@ MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, Supplemental SSE3 accelerated"); MODULE_ALIAS_CRYPTO("sha256"); +MODULE_ALIAS_CRYPTO("sha256-ssse3"); +MODULE_ALIAS_CRYPTO("sha256-avx"); +MODULE_ALIAS_CRYPTO("sha256-avx2"); MODULE_ALIAS_CRYPTO("sha224"); +MODULE_ALIAS_CRYPTO("sha224-ssse3"); +MODULE_ALIAS_CRYPTO("sha224-avx"); +MODULE_ALIAS_CRYPTO("sha224-avx2"); +#ifdef CONFIG_AS_SHA256_NI +MODULE_ALIAS_CRYPTO("sha256-ni"); +MODULE_ALIAS_CRYPTO("sha224-ni"); +#endif diff --git a/arch/x86/crypto/sha512-mb/Makefile b/arch/x86/crypto/sha512-mb/Makefile new file mode 100644 index 000000000000..0a57e2103980 --- /dev/null +++ b/arch/x86/crypto/sha512-mb/Makefile @@ -0,0 +1,11 @@ +# +# Arch-specific CryptoAPI modules. +# + +avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\ + $(comma)4)$(comma)%ymm2,yes,no) +ifeq ($(avx2_supported),yes) + obj-$(CONFIG_CRYPTO_SHA512_MB) += sha512-mb.o + sha512-mb-y := sha512_mb.o sha512_mb_mgr_flush_avx2.o \ + sha512_mb_mgr_init_avx2.o sha512_mb_mgr_submit_avx2.o sha512_x4_avx2.o +endif diff --git a/arch/x86/crypto/sha512-mb/sha512_mb.c b/arch/x86/crypto/sha512-mb/sha512_mb.c new file mode 100644 index 000000000000..f4cf5b78fd36 --- /dev/null +++ b/arch/x86/crypto/sha512-mb/sha512_mb.c @@ -0,0 +1,1046 @@ +/* + * Multi buffer SHA512 algorithm Glue Code + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2016 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Megha Dey <megha.dey@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2016 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <crypto/internal/hash.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/cryptohash.h> +#include <linux/types.h> +#include <linux/list.h> +#include <crypto/scatterwalk.h> +#include <crypto/sha.h> +#include <crypto/mcryptd.h> +#include <crypto/crypto_wq.h> +#include <asm/byteorder.h> +#include <linux/hardirq.h> +#include <asm/fpu/api.h> +#include "sha512_mb_ctx.h" + +#define FLUSH_INTERVAL 1000 /* in usec */ + +static struct mcryptd_alg_state sha512_mb_alg_state; + +struct sha512_mb_ctx { + struct mcryptd_ahash *mcryptd_tfm; +}; + +static inline struct mcryptd_hash_request_ctx + *cast_hash_to_mcryptd_ctx(struct sha512_hash_ctx *hash_ctx) +{ + struct ahash_request *areq; + + areq = container_of((void *) hash_ctx, struct ahash_request, __ctx); + return container_of(areq, struct mcryptd_hash_request_ctx, areq); +} + +static inline struct ahash_request + *cast_mcryptd_ctx_to_req(struct mcryptd_hash_request_ctx *ctx) +{ + return container_of((void *) ctx, struct ahash_request, __ctx); +} + +static void req_ctx_init(struct mcryptd_hash_request_ctx *rctx, + struct ahash_request *areq) +{ + rctx->flag = HASH_UPDATE; +} + +static asmlinkage void (*sha512_job_mgr_init)(struct sha512_mb_mgr *state); +static asmlinkage struct job_sha512* (*sha512_job_mgr_submit) + (struct sha512_mb_mgr *state, + struct job_sha512 *job); +static asmlinkage struct job_sha512* (*sha512_job_mgr_flush) + (struct sha512_mb_mgr *state); +static asmlinkage struct job_sha512* (*sha512_job_mgr_get_comp_job) + (struct sha512_mb_mgr *state); + +inline void sha512_init_digest(uint64_t *digest) +{ + static const uint64_t initial_digest[SHA512_DIGEST_LENGTH] = { + SHA512_H0, SHA512_H1, SHA512_H2, + SHA512_H3, SHA512_H4, SHA512_H5, + SHA512_H6, SHA512_H7 }; + memcpy(digest, initial_digest, sizeof(initial_digest)); +} + +inline uint32_t sha512_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], + uint32_t total_len) +{ + uint32_t i = total_len & (SHA512_BLOCK_SIZE - 1); + + memset(&padblock[i], 0, SHA512_BLOCK_SIZE); + padblock[i] = 0x80; + + i += ((SHA512_BLOCK_SIZE - 1) & + (0 - (total_len + SHA512_PADLENGTHFIELD_SIZE + 1))) + + 1 + SHA512_PADLENGTHFIELD_SIZE; + +#if SHA512_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) &padblock[i - 16]) = 0; +#endif + + *((uint64_t *) &padblock[i - 8]) = cpu_to_be64(total_len << 3); + + /* Number of extra blocks to hash */ + return i >> SHA512_LOG2_BLOCK_SIZE; +} + +static struct sha512_hash_ctx *sha512_ctx_mgr_resubmit + (struct sha512_ctx_mgr *mgr, struct sha512_hash_ctx *ctx) +{ + while (ctx) { + if (ctx->status & HASH_CTX_STS_COMPLETE) { + /* Clear PROCESSING bit */ + ctx->status = HASH_CTX_STS_COMPLETE; + return ctx; + } + + /* + * If the extra blocks are empty, begin hashing what remains + * in the user's buffer. + */ + if (ctx->partial_block_buffer_length == 0 && + ctx->incoming_buffer_length) { + + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + uint32_t copy_len; + + /* + * Only entire blocks can be hashed. + * Copy remainder to extra blocks buffer. + */ + copy_len = len & (SHA512_BLOCK_SIZE-1); + + if (copy_len) { + len -= copy_len; + memcpy(ctx->partial_block_buffer, + ((const char *) buffer + len), + copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + /* len should be a multiple of the block size now */ + assert((len % SHA512_BLOCK_SIZE) == 0); + + /* Set len to the number of blocks to be hashed */ + len >>= SHA512_LOG2_BLOCK_SIZE; + + if (len) { + + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (struct sha512_hash_ctx *) + sha512_job_mgr_submit(&mgr->mgr, + &ctx->job); + continue; + } + } + + /* + * If the extra blocks are not empty, then we are + * either on the last block(s) or we need more + * user input before continuing. + */ + if (ctx->status & HASH_CTX_STS_LAST) { + + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = + sha512_pad(buf, ctx->total_length); + + ctx->status = (HASH_CTX_STS_PROCESSING | + HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (struct sha512_hash_ctx *) + sha512_job_mgr_submit(&mgr->mgr, &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static struct sha512_hash_ctx + *sha512_ctx_mgr_get_comp_ctx(struct sha512_ctx_mgr *mgr) +{ + /* + * If get_comp_job returns NULL, there are no jobs complete. + * If get_comp_job returns a job, verify that it is safe to return to + * the user. + * If it is not ready, resubmit the job to finish processing. + * If sha512_ctx_mgr_resubmit returned a job, it is ready to be + * returned. + * Otherwise, all jobs currently being managed by the hash_ctx_mgr + * still need processing. + */ + struct sha512_hash_ctx *ctx; + + ctx = (struct sha512_hash_ctx *) + sha512_job_mgr_get_comp_job(&mgr->mgr); + return sha512_ctx_mgr_resubmit(mgr, ctx); +} + +static void sha512_ctx_mgr_init(struct sha512_ctx_mgr *mgr) +{ + sha512_job_mgr_init(&mgr->mgr); +} + +static struct sha512_hash_ctx + *sha512_ctx_mgr_submit(struct sha512_ctx_mgr *mgr, + struct sha512_hash_ctx *ctx, + const void *buffer, + uint32_t len, + int flags) +{ + if (flags & (~HASH_ENTIRE)) { + /* + * User should not pass anything other than FIRST, UPDATE, or + * LAST + */ + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + /* Cannot submit to a currently processing job. */ + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + /* Cannot update a finished job. */ + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + + if (flags & HASH_FIRST) { + /* Init digest */ + sha512_init_digest(ctx->job.result_digest); + + /* Reset byte counter */ + ctx->total_length = 0; + + /* Clear extra blocks */ + ctx->partial_block_buffer_length = 0; + } + + /* + * If we made it here, there were no errors during this call to + * submit + */ + ctx->error = HASH_CTX_ERROR_NONE; + + /* Store buffer ptr info from user */ + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + /* + * Store the user's request flags and mark this ctx as currently being + * processed. + */ + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + /* Advance byte counter */ + ctx->total_length += len; + + /* + * If there is anything currently buffered in the extra blocks, + * append to it until it contains a whole block. + * Or if the user's buffer contains less than a whole block, + * append as much as possible to the extra block. + */ + if (ctx->partial_block_buffer_length || len < SHA512_BLOCK_SIZE) { + /* Compute how many bytes to copy from user buffer into extra + * block + */ + uint32_t copy_len = SHA512_BLOCK_SIZE - + ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + /* Copy and update relevant pointers and counters */ + memcpy + (&ctx->partial_block_buffer[ctx->partial_block_buffer_length], + buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *) + ((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + + /* The extra block should never contain more than 1 block + * here + */ + assert(ctx->partial_block_buffer_length <= SHA512_BLOCK_SIZE); + + /* If the extra block buffer contains exactly 1 block, it can + * be hashed. + */ + if (ctx->partial_block_buffer_length >= SHA512_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + ctx = (struct sha512_hash_ctx *) + sha512_job_mgr_submit(&mgr->mgr, &ctx->job); + } + } + + return sha512_ctx_mgr_resubmit(mgr, ctx); +} + +static struct sha512_hash_ctx *sha512_ctx_mgr_flush(struct sha512_ctx_mgr *mgr) +{ + struct sha512_hash_ctx *ctx; + + while (1) { + ctx = (struct sha512_hash_ctx *) + sha512_job_mgr_flush(&mgr->mgr); + + /* If flush returned 0, there are no more jobs in flight. */ + if (!ctx) + return NULL; + + /* + * If flush returned a job, resubmit the job to finish + * processing. + */ + ctx = sha512_ctx_mgr_resubmit(mgr, ctx); + + /* + * If sha512_ctx_mgr_resubmit returned a job, it is ready to + * be returned. Otherwise, all jobs currently being managed by + * the sha512_ctx_mgr still need processing. Loop. + */ + if (ctx) + return ctx; + } +} + +static int sha512_mb_init(struct ahash_request *areq) +{ + struct sha512_hash_ctx *sctx = ahash_request_ctx(areq); + + hash_ctx_init(sctx); + sctx->job.result_digest[0] = SHA512_H0; + sctx->job.result_digest[1] = SHA512_H1; + sctx->job.result_digest[2] = SHA512_H2; + sctx->job.result_digest[3] = SHA512_H3; + sctx->job.result_digest[4] = SHA512_H4; + sctx->job.result_digest[5] = SHA512_H5; + sctx->job.result_digest[6] = SHA512_H6; + sctx->job.result_digest[7] = SHA512_H7; + sctx->total_length = 0; + sctx->partial_block_buffer_length = 0; + sctx->status = HASH_CTX_STS_IDLE; + + return 0; +} + +static int sha512_mb_set_results(struct mcryptd_hash_request_ctx *rctx) +{ + int i; + struct sha512_hash_ctx *sctx = ahash_request_ctx(&rctx->areq); + __be64 *dst = (__be64 *) rctx->out; + + for (i = 0; i < 8; ++i) + dst[i] = cpu_to_be64(sctx->job.result_digest[i]); + + return 0; +} + +static int sha_finish_walk(struct mcryptd_hash_request_ctx **ret_rctx, + struct mcryptd_alg_cstate *cstate, bool flush) +{ + int flag = HASH_UPDATE; + int nbytes, err = 0; + struct mcryptd_hash_request_ctx *rctx = *ret_rctx; + struct sha512_hash_ctx *sha_ctx; + + /* more work ? */ + while (!(rctx->flag & HASH_DONE)) { + nbytes = crypto_ahash_walk_done(&rctx->walk, 0); + if (nbytes < 0) { + err = nbytes; + goto out; + } + /* check if the walk is done */ + if (crypto_ahash_walk_last(&rctx->walk)) { + rctx->flag |= HASH_DONE; + if (rctx->flag & HASH_FINAL) + flag |= HASH_LAST; + + } + sha_ctx = (struct sha512_hash_ctx *) + ahash_request_ctx(&rctx->areq); + kernel_fpu_begin(); + sha_ctx = sha512_ctx_mgr_submit(cstate->mgr, sha_ctx, + rctx->walk.data, nbytes, flag); + if (!sha_ctx) { + if (flush) + sha_ctx = sha512_ctx_mgr_flush(cstate->mgr); + } + kernel_fpu_end(); + if (sha_ctx) + rctx = cast_hash_to_mcryptd_ctx(sha_ctx); + else { + rctx = NULL; + goto out; + } + } + + /* copy the results */ + if (rctx->flag & HASH_FINAL) + sha512_mb_set_results(rctx); + +out: + *ret_rctx = rctx; + return err; +} + +static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx, + struct mcryptd_alg_cstate *cstate, + int err) +{ + struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx); + struct sha512_hash_ctx *sha_ctx; + struct mcryptd_hash_request_ctx *req_ctx; + int ret; + + /* remove from work list */ + spin_lock(&cstate->work_lock); + list_del(&rctx->waiter); + spin_unlock(&cstate->work_lock); + + if (irqs_disabled()) + rctx->complete(&req->base, err); + else { + local_bh_disable(); + rctx->complete(&req->base, err); + local_bh_enable(); + } + + /* check to see if there are other jobs that are done */ + sha_ctx = sha512_ctx_mgr_get_comp_ctx(cstate->mgr); + while (sha_ctx) { + req_ctx = cast_hash_to_mcryptd_ctx(sha_ctx); + ret = sha_finish_walk(&req_ctx, cstate, false); + if (req_ctx) { + spin_lock(&cstate->work_lock); + list_del(&req_ctx->waiter); + spin_unlock(&cstate->work_lock); + + req = cast_mcryptd_ctx_to_req(req_ctx); + if (irqs_disabled()) + rctx->complete(&req->base, ret); + else { + local_bh_disable(); + rctx->complete(&req->base, ret); + local_bh_enable(); + } + } + sha_ctx = sha512_ctx_mgr_get_comp_ctx(cstate->mgr); + } + + return 0; +} + +static void sha512_mb_add_list(struct mcryptd_hash_request_ctx *rctx, + struct mcryptd_alg_cstate *cstate) +{ + unsigned long next_flush; + unsigned long delay = usecs_to_jiffies(FLUSH_INTERVAL); + + /* initialize tag */ + rctx->tag.arrival = jiffies; /* tag the arrival time */ + rctx->tag.seq_num = cstate->next_seq_num++; + next_flush = rctx->tag.arrival + delay; + rctx->tag.expire = next_flush; + + spin_lock(&cstate->work_lock); + list_add_tail(&rctx->waiter, &cstate->work_list); + spin_unlock(&cstate->work_lock); + + mcryptd_arm_flusher(cstate, delay); +} + +static int sha512_mb_update(struct ahash_request *areq) +{ + struct mcryptd_hash_request_ctx *rctx = + container_of(areq, struct mcryptd_hash_request_ctx, + areq); + struct mcryptd_alg_cstate *cstate = + this_cpu_ptr(sha512_mb_alg_state.alg_cstate); + + struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx); + struct sha512_hash_ctx *sha_ctx; + int ret = 0, nbytes; + + + /* sanity check */ + if (rctx->tag.cpu != smp_processor_id()) { + pr_err("mcryptd error: cpu clash\n"); + goto done; + } + + /* need to init context */ + req_ctx_init(rctx, areq); + + nbytes = crypto_ahash_walk_first(req, &rctx->walk); + + if (nbytes < 0) { + ret = nbytes; + goto done; + } + + if (crypto_ahash_walk_last(&rctx->walk)) + rctx->flag |= HASH_DONE; + + /* submit */ + sha_ctx = (struct sha512_hash_ctx *) ahash_request_ctx(areq); + sha512_mb_add_list(rctx, cstate); + kernel_fpu_begin(); + sha_ctx = sha512_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data, + nbytes, HASH_UPDATE); + kernel_fpu_end(); + + /* check if anything is returned */ + if (!sha_ctx) + return -EINPROGRESS; + + if (sha_ctx->error) { + ret = sha_ctx->error; + rctx = cast_hash_to_mcryptd_ctx(sha_ctx); + goto done; + } + + rctx = cast_hash_to_mcryptd_ctx(sha_ctx); + ret = sha_finish_walk(&rctx, cstate, false); + + if (!rctx) + return -EINPROGRESS; +done: + sha_complete_job(rctx, cstate, ret); + return ret; +} + +static int sha512_mb_finup(struct ahash_request *areq) +{ + struct mcryptd_hash_request_ctx *rctx = + container_of(areq, struct mcryptd_hash_request_ctx, + areq); + struct mcryptd_alg_cstate *cstate = + this_cpu_ptr(sha512_mb_alg_state.alg_cstate); + + struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx); + struct sha512_hash_ctx *sha_ctx; + int ret = 0, flag = HASH_UPDATE, nbytes; + + /* sanity check */ + if (rctx->tag.cpu != smp_processor_id()) { + pr_err("mcryptd error: cpu clash\n"); + goto done; + } + + /* need to init context */ + req_ctx_init(rctx, areq); + + nbytes = crypto_ahash_walk_first(req, &rctx->walk); + + if (nbytes < 0) { + ret = nbytes; + goto done; + } + + if (crypto_ahash_walk_last(&rctx->walk)) { + rctx->flag |= HASH_DONE; + flag = HASH_LAST; + } + + /* submit */ + rctx->flag |= HASH_FINAL; + sha_ctx = (struct sha512_hash_ctx *) ahash_request_ctx(areq); + sha512_mb_add_list(rctx, cstate); + + kernel_fpu_begin(); + sha_ctx = sha512_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data, + nbytes, flag); + kernel_fpu_end(); + + /* check if anything is returned */ + if (!sha_ctx) + return -EINPROGRESS; + + if (sha_ctx->error) { + ret = sha_ctx->error; + goto done; + } + + rctx = cast_hash_to_mcryptd_ctx(sha_ctx); + ret = sha_finish_walk(&rctx, cstate, false); + if (!rctx) + return -EINPROGRESS; +done: + sha_complete_job(rctx, cstate, ret); + return ret; +} + +static int sha512_mb_final(struct ahash_request *areq) +{ + struct mcryptd_hash_request_ctx *rctx = + container_of(areq, struct mcryptd_hash_request_ctx, + areq); + struct mcryptd_alg_cstate *cstate = + this_cpu_ptr(sha512_mb_alg_state.alg_cstate); + + struct sha512_hash_ctx *sha_ctx; + int ret = 0; + u8 data; + + /* sanity check */ + if (rctx->tag.cpu != smp_processor_id()) { + pr_err("mcryptd error: cpu clash\n"); + goto done; + } + + /* need to init context */ + req_ctx_init(rctx, areq); + + rctx->flag |= HASH_DONE | HASH_FINAL; + + sha_ctx = (struct sha512_hash_ctx *) ahash_request_ctx(areq); + /* flag HASH_FINAL and 0 data size */ + sha512_mb_add_list(rctx, cstate); + kernel_fpu_begin(); + sha_ctx = sha512_ctx_mgr_submit(cstate->mgr, sha_ctx, &data, 0, + HASH_LAST); + kernel_fpu_end(); + + /* check if anything is returned */ + if (!sha_ctx) + return -EINPROGRESS; + + if (sha_ctx->error) { + ret = sha_ctx->error; + rctx = cast_hash_to_mcryptd_ctx(sha_ctx); + goto done; + } + + rctx = cast_hash_to_mcryptd_ctx(sha_ctx); + ret = sha_finish_walk(&rctx, cstate, false); + if (!rctx) + return -EINPROGRESS; +done: + sha_complete_job(rctx, cstate, ret); + return ret; +} + +static int sha512_mb_export(struct ahash_request *areq, void *out) +{ + struct sha512_hash_ctx *sctx = ahash_request_ctx(areq); + + memcpy(out, sctx, sizeof(*sctx)); + + return 0; +} + +static int sha512_mb_import(struct ahash_request *areq, const void *in) +{ + struct sha512_hash_ctx *sctx = ahash_request_ctx(areq); + + memcpy(sctx, in, sizeof(*sctx)); + + return 0; +} + +static int sha512_mb_async_init_tfm(struct crypto_tfm *tfm) +{ + struct mcryptd_ahash *mcryptd_tfm; + struct sha512_mb_ctx *ctx = crypto_tfm_ctx(tfm); + struct mcryptd_hash_ctx *mctx; + + mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha512-mb", + CRYPTO_ALG_INTERNAL, + CRYPTO_ALG_INTERNAL); + if (IS_ERR(mcryptd_tfm)) + return PTR_ERR(mcryptd_tfm); + mctx = crypto_ahash_ctx(&mcryptd_tfm->base); + mctx->alg_state = &sha512_mb_alg_state; + ctx->mcryptd_tfm = mcryptd_tfm; + crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm), + sizeof(struct ahash_request) + + crypto_ahash_reqsize(&mcryptd_tfm->base)); + + return 0; +} + +static void sha512_mb_async_exit_tfm(struct crypto_tfm *tfm) +{ + struct sha512_mb_ctx *ctx = crypto_tfm_ctx(tfm); + + mcryptd_free_ahash(ctx->mcryptd_tfm); +} + +static int sha512_mb_areq_init_tfm(struct crypto_tfm *tfm) +{ + crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm), + sizeof(struct ahash_request) + + sizeof(struct sha512_hash_ctx)); + + return 0; +} + +static void sha512_mb_areq_exit_tfm(struct crypto_tfm *tfm) +{ + struct sha512_mb_ctx *ctx = crypto_tfm_ctx(tfm); + + mcryptd_free_ahash(ctx->mcryptd_tfm); +} + +static struct ahash_alg sha512_mb_areq_alg = { + .init = sha512_mb_init, + .update = sha512_mb_update, + .final = sha512_mb_final, + .finup = sha512_mb_finup, + .export = sha512_mb_export, + .import = sha512_mb_import, + .halg = { + .digestsize = SHA512_DIGEST_SIZE, + .statesize = sizeof(struct sha512_hash_ctx), + .base = { + .cra_name = "__sha512-mb", + .cra_driver_name = "__intel_sha512-mb", + .cra_priority = 100, + /* + * use ASYNC flag as some buffers in multi-buffer + * algo may not have completed before hashing thread + * sleep + */ + .cra_flags = CRYPTO_ALG_TYPE_AHASH | + CRYPTO_ALG_ASYNC | + CRYPTO_ALG_INTERNAL, + .cra_blocksize = SHA512_BLOCK_SIZE, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT + (sha512_mb_areq_alg.halg.base.cra_list), + .cra_init = sha512_mb_areq_init_tfm, + .cra_exit = sha512_mb_areq_exit_tfm, + .cra_ctxsize = sizeof(struct sha512_hash_ctx), + } + } +}; + +static int sha512_mb_async_init(struct ahash_request *req) +{ + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha512_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + return crypto_ahash_init(mcryptd_req); +} + +static int sha512_mb_async_update(struct ahash_request *req) +{ + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha512_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + return crypto_ahash_update(mcryptd_req); +} + +static int sha512_mb_async_finup(struct ahash_request *req) +{ + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha512_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + return crypto_ahash_finup(mcryptd_req); +} + +static int sha512_mb_async_final(struct ahash_request *req) +{ + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha512_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + return crypto_ahash_final(mcryptd_req); +} + +static int sha512_mb_async_digest(struct ahash_request *req) +{ + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha512_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + return crypto_ahash_digest(mcryptd_req); +} + +static int sha512_mb_async_export(struct ahash_request *req, void *out) +{ + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha512_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + return crypto_ahash_export(mcryptd_req, out); +} + +static int sha512_mb_async_import(struct ahash_request *req, const void *in) +{ + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha512_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + struct crypto_ahash *child = mcryptd_ahash_child(mcryptd_tfm); + struct mcryptd_hash_request_ctx *rctx; + struct ahash_request *areq; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + rctx = ahash_request_ctx(mcryptd_req); + + areq = &rctx->areq; + + ahash_request_set_tfm(areq, child); + ahash_request_set_callback(areq, CRYPTO_TFM_REQ_MAY_SLEEP, + rctx->complete, req); + + return crypto_ahash_import(mcryptd_req, in); +} + +static struct ahash_alg sha512_mb_async_alg = { + .init = sha512_mb_async_init, + .update = sha512_mb_async_update, + .final = sha512_mb_async_final, + .finup = sha512_mb_async_finup, + .digest = sha512_mb_async_digest, + .export = sha512_mb_async_export, + .import = sha512_mb_async_import, + .halg = { + .digestsize = SHA512_DIGEST_SIZE, + .statesize = sizeof(struct sha512_hash_ctx), + .base = { + .cra_name = "sha512", + .cra_driver_name = "sha512_mb", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_AHASH | + CRYPTO_ALG_ASYNC, + .cra_blocksize = SHA512_BLOCK_SIZE, + .cra_type = &crypto_ahash_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT + (sha512_mb_async_alg.halg.base.cra_list), + .cra_init = sha512_mb_async_init_tfm, + .cra_exit = sha512_mb_async_exit_tfm, + .cra_ctxsize = sizeof(struct sha512_mb_ctx), + .cra_alignmask = 0, + }, + }, +}; + +static unsigned long sha512_mb_flusher(struct mcryptd_alg_cstate *cstate) +{ + struct mcryptd_hash_request_ctx *rctx; + unsigned long cur_time; + unsigned long next_flush = 0; + struct sha512_hash_ctx *sha_ctx; + + + cur_time = jiffies; + + while (!list_empty(&cstate->work_list)) { + rctx = list_entry(cstate->work_list.next, + struct mcryptd_hash_request_ctx, waiter); + if time_before(cur_time, rctx->tag.expire) + break; + kernel_fpu_begin(); + sha_ctx = (struct sha512_hash_ctx *) + sha512_ctx_mgr_flush(cstate->mgr); + kernel_fpu_end(); + if (!sha_ctx) { + pr_err("sha512_mb error: nothing got flushed for" + " non-empty list\n"); + break; + } + rctx = cast_hash_to_mcryptd_ctx(sha_ctx); + sha_finish_walk(&rctx, cstate, true); + sha_complete_job(rctx, cstate, 0); + } + + if (!list_empty(&cstate->work_list)) { + rctx = list_entry(cstate->work_list.next, + struct mcryptd_hash_request_ctx, waiter); + /* get the hash context and then flush time */ + next_flush = rctx->tag.expire; + mcryptd_arm_flusher(cstate, get_delay(next_flush)); + } + return next_flush; +} + +static int __init sha512_mb_mod_init(void) +{ + + int cpu; + int err; + struct mcryptd_alg_cstate *cpu_state; + + /* check for dependent cpu features */ + if (!boot_cpu_has(X86_FEATURE_AVX2) || + !boot_cpu_has(X86_FEATURE_BMI2)) + return -ENODEV; + + /* initialize multibuffer structures */ + sha512_mb_alg_state.alg_cstate = + alloc_percpu(struct mcryptd_alg_cstate); + + sha512_job_mgr_init = sha512_mb_mgr_init_avx2; + sha512_job_mgr_submit = sha512_mb_mgr_submit_avx2; + sha512_job_mgr_flush = sha512_mb_mgr_flush_avx2; + sha512_job_mgr_get_comp_job = sha512_mb_mgr_get_comp_job_avx2; + + if (!sha512_mb_alg_state.alg_cstate) + return -ENOMEM; + for_each_possible_cpu(cpu) { + cpu_state = per_cpu_ptr(sha512_mb_alg_state.alg_cstate, cpu); + cpu_state->next_flush = 0; + cpu_state->next_seq_num = 0; + cpu_state->flusher_engaged = false; + INIT_DELAYED_WORK(&cpu_state->flush, mcryptd_flusher); + cpu_state->cpu = cpu; + cpu_state->alg_state = &sha512_mb_alg_state; + cpu_state->mgr = kzalloc(sizeof(struct sha512_ctx_mgr), + GFP_KERNEL); + if (!cpu_state->mgr) + goto err2; + sha512_ctx_mgr_init(cpu_state->mgr); + INIT_LIST_HEAD(&cpu_state->work_list); + spin_lock_init(&cpu_state->work_lock); + } + sha512_mb_alg_state.flusher = &sha512_mb_flusher; + + err = crypto_register_ahash(&sha512_mb_areq_alg); + if (err) + goto err2; + err = crypto_register_ahash(&sha512_mb_async_alg); + if (err) + goto err1; + + + return 0; +err1: + crypto_unregister_ahash(&sha512_mb_areq_alg); +err2: + for_each_possible_cpu(cpu) { + cpu_state = per_cpu_ptr(sha512_mb_alg_state.alg_cstate, cpu); + kfree(cpu_state->mgr); + } + free_percpu(sha512_mb_alg_state.alg_cstate); + return -ENODEV; +} + +static void __exit sha512_mb_mod_fini(void) +{ + int cpu; + struct mcryptd_alg_cstate *cpu_state; + + crypto_unregister_ahash(&sha512_mb_async_alg); + crypto_unregister_ahash(&sha512_mb_areq_alg); + for_each_possible_cpu(cpu) { + cpu_state = per_cpu_ptr(sha512_mb_alg_state.alg_cstate, cpu); + kfree(cpu_state->mgr); + } + free_percpu(sha512_mb_alg_state.alg_cstate); +} + +module_init(sha512_mb_mod_init); +module_exit(sha512_mb_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, multi buffer accelerated"); + +MODULE_ALIAS("sha512"); diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_ctx.h b/arch/x86/crypto/sha512-mb/sha512_mb_ctx.h new file mode 100644 index 000000000000..9d4b2c8208d5 --- /dev/null +++ b/arch/x86/crypto/sha512-mb/sha512_mb_ctx.h @@ -0,0 +1,130 @@ +/* + * Header file for multi buffer SHA512 context + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2016 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Megha Dey <megha.dey@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2016 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _SHA_MB_CTX_INTERNAL_H +#define _SHA_MB_CTX_INTERNAL_H + +#include "sha512_mb_mgr.h" + +#define HASH_UPDATE 0x00 +#define HASH_FIRST 0x01 +#define HASH_LAST 0x02 +#define HASH_ENTIRE 0x03 +#define HASH_DONE 0x04 +#define HASH_FINAL 0x08 + +#define HASH_CTX_STS_IDLE 0x00 +#define HASH_CTX_STS_PROCESSING 0x01 +#define HASH_CTX_STS_LAST 0x02 +#define HASH_CTX_STS_COMPLETE 0x04 + +enum hash_ctx_error { + HASH_CTX_ERROR_NONE = 0, + HASH_CTX_ERROR_INVALID_FLAGS = -1, + HASH_CTX_ERROR_ALREADY_PROCESSING = -2, + HASH_CTX_ERROR_ALREADY_COMPLETED = -3, +}; + +#define hash_ctx_user_data(ctx) ((ctx)->user_data) +#define hash_ctx_digest(ctx) ((ctx)->job.result_digest) +#define hash_ctx_processing(ctx) ((ctx)->status & HASH_CTX_STS_PROCESSING) +#define hash_ctx_complete(ctx) ((ctx)->status == HASH_CTX_STS_COMPLETE) +#define hash_ctx_status(ctx) ((ctx)->status) +#define hash_ctx_error(ctx) ((ctx)->error) +#define hash_ctx_init(ctx) \ + do { \ + (ctx)->error = HASH_CTX_ERROR_NONE; \ + (ctx)->status = HASH_CTX_STS_COMPLETE; \ + } while (0) + +/* Hash Constants and Typedefs */ +#define SHA512_DIGEST_LENGTH 8 +#define SHA512_LOG2_BLOCK_SIZE 7 + +#define SHA512_PADLENGTHFIELD_SIZE 16 + +#ifdef SHA_MB_DEBUG +#define assert(expr) \ +do { \ + if (unlikely(!(expr))) { \ + printk(KERN_ERR "Assertion failed! %s,%s,%s,line=%d\n", \ + #expr, __FILE__, __func__, __LINE__); \ + } \ +} while (0) +#else +#define assert(expr) do {} while (0) +#endif + +struct sha512_ctx_mgr { + struct sha512_mb_mgr mgr; +}; + +/* typedef struct sha512_ctx_mgr sha512_ctx_mgr; */ + +struct sha512_hash_ctx { + /* Must be at struct offset 0 */ + struct job_sha512 job; + /* status flag */ + int status; + /* error flag */ + int error; + + uint32_t total_length; + const void *incoming_buffer; + uint32_t incoming_buffer_length; + uint8_t partial_block_buffer[SHA512_BLOCK_SIZE * 2]; + uint32_t partial_block_buffer_length; + void *user_data; +}; + +#endif diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_mgr.h b/arch/x86/crypto/sha512-mb/sha512_mb_mgr.h new file mode 100644 index 000000000000..178f17eef382 --- /dev/null +++ b/arch/x86/crypto/sha512-mb/sha512_mb_mgr.h @@ -0,0 +1,104 @@ +/* + * Header file for multi buffer SHA512 algorithm manager + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2016 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Megha Dey <megha.dey@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2016 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __SHA_MB_MGR_H +#define __SHA_MB_MGR_H + +#include <linux/types.h> + +#define NUM_SHA512_DIGEST_WORDS 8 + +enum job_sts {STS_UNKNOWN = 0, + STS_BEING_PROCESSED = 1, + STS_COMPLETED = 2, + STS_INTERNAL_ERROR = 3, + STS_ERROR = 4 +}; + +struct job_sha512 { + u8 *buffer; + u64 len; + u64 result_digest[NUM_SHA512_DIGEST_WORDS] __aligned(32); + enum job_sts status; + void *user_data; +}; + +struct sha512_args_x4 { + uint64_t digest[8][4]; + uint8_t *data_ptr[4]; +}; + +struct sha512_lane_data { + struct job_sha512 *job_in_lane; +}; + +struct sha512_mb_mgr { + struct sha512_args_x4 args; + + uint64_t lens[4]; + + /* each byte is index (0...7) of unused lanes */ + uint64_t unused_lanes; + /* byte 4 is set to FF as a flag */ + struct sha512_lane_data ldata[4]; +}; + +#define SHA512_MB_MGR_NUM_LANES_AVX2 4 + +void sha512_mb_mgr_init_avx2(struct sha512_mb_mgr *state); +struct job_sha512 *sha512_mb_mgr_submit_avx2(struct sha512_mb_mgr *state, + struct job_sha512 *job); +struct job_sha512 *sha512_mb_mgr_flush_avx2(struct sha512_mb_mgr *state); +struct job_sha512 *sha512_mb_mgr_get_comp_job_avx2(struct sha512_mb_mgr *state); + +#endif diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_datastruct.S b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_datastruct.S new file mode 100644 index 000000000000..cf2636d4c9ba --- /dev/null +++ b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_datastruct.S @@ -0,0 +1,281 @@ +/* + * Header file for multi buffer SHA256 algorithm data structure + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2016 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Megha Dey <megha.dey@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2016 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +# Macros for defining data structures + +# Usage example + +#START_FIELDS # JOB_AES +### name size align +#FIELD _plaintext, 8, 8 # pointer to plaintext +#FIELD _ciphertext, 8, 8 # pointer to ciphertext +#FIELD _IV, 16, 8 # IV +#FIELD _keys, 8, 8 # pointer to keys +#FIELD _len, 4, 4 # length in bytes +#FIELD _status, 4, 4 # status enumeration +#FIELD _user_data, 8, 8 # pointer to user data +#UNION _union, size1, align1, \ +# size2, align2, \ +# size3, align3, \ +# ... +#END_FIELDS +#%assign _JOB_AES_size _FIELD_OFFSET +#%assign _JOB_AES_align _STRUCT_ALIGN + +######################################################################### + +# Alternate "struc-like" syntax: +# STRUCT job_aes2 +# RES_Q .plaintext, 1 +# RES_Q .ciphertext, 1 +# RES_DQ .IV, 1 +# RES_B .nested, _JOB_AES_SIZE, _JOB_AES_ALIGN +# RES_U .union, size1, align1, \ +# size2, align2, \ +# ... +# ENDSTRUCT +# # Following only needed if nesting +# %assign job_aes2_size _FIELD_OFFSET +# %assign job_aes2_align _STRUCT_ALIGN +# +# RES_* macros take a name, a count and an optional alignment. +# The count in in terms of the base size of the macro, and the +# default alignment is the base size. +# The macros are: +# Macro Base size +# RES_B 1 +# RES_W 2 +# RES_D 4 +# RES_Q 8 +# RES_DQ 16 +# RES_Y 32 +# RES_Z 64 +# +# RES_U defines a union. It's arguments are a name and two or more +# pairs of "size, alignment" +# +# The two assigns are only needed if this structure is being nested +# within another. Even if the assigns are not done, one can still use +# STRUCT_NAME_size as the size of the structure. +# +# Note that for nesting, you still need to assign to STRUCT_NAME_size. +# +# The differences between this and using "struc" directly are that each +# type is implicitly aligned to its natural length (although this can be +# over-ridden with an explicit third parameter), and that the structure +# is padded at the end to its overall alignment. +# + +######################################################################### + +#ifndef _DATASTRUCT_ASM_ +#define _DATASTRUCT_ASM_ + +#define PTR_SZ 8 +#define SHA512_DIGEST_WORD_SIZE 8 +#define SHA512_MB_MGR_NUM_LANES_AVX2 4 +#define NUM_SHA512_DIGEST_WORDS 8 +#define SZ4 4*SHA512_DIGEST_WORD_SIZE +#define ROUNDS 80*SZ4 +#define SHA512_DIGEST_ROW_SIZE (SHA512_MB_MGR_NUM_LANES_AVX2 * 8) + +# START_FIELDS +.macro START_FIELDS + _FIELD_OFFSET = 0 + _STRUCT_ALIGN = 0 +.endm + +# FIELD name size align +.macro FIELD name size align + _FIELD_OFFSET = (_FIELD_OFFSET + (\align) - 1) & (~ ((\align)-1)) + \name = _FIELD_OFFSET + _FIELD_OFFSET = _FIELD_OFFSET + (\size) +.if (\align > _STRUCT_ALIGN) + _STRUCT_ALIGN = \align +.endif +.endm + +# END_FIELDS +.macro END_FIELDS + _FIELD_OFFSET = (_FIELD_OFFSET + _STRUCT_ALIGN-1) & (~ (_STRUCT_ALIGN-1)) +.endm + +.macro STRUCT p1 +START_FIELDS +.struc \p1 +.endm + +.macro ENDSTRUCT + tmp = _FIELD_OFFSET + END_FIELDS + tmp = (_FIELD_OFFSET - ##tmp) +.if (tmp > 0) + .lcomm tmp +.endm + +## RES_int name size align +.macro RES_int p1 p2 p3 + name = \p1 + size = \p2 + align = .\p3 + + _FIELD_OFFSET = (_FIELD_OFFSET + (align) - 1) & (~ ((align)-1)) +.align align +.lcomm name size + _FIELD_OFFSET = _FIELD_OFFSET + (size) +.if (align > _STRUCT_ALIGN) + _STRUCT_ALIGN = align +.endif +.endm + +# macro RES_B name, size [, align] +.macro RES_B _name, _size, _align=1 +RES_int _name _size _align +.endm + +# macro RES_W name, size [, align] +.macro RES_W _name, _size, _align=2 +RES_int _name 2*(_size) _align +.endm + +# macro RES_D name, size [, align] +.macro RES_D _name, _size, _align=4 +RES_int _name 4*(_size) _align +.endm + +# macro RES_Q name, size [, align] +.macro RES_Q _name, _size, _align=8 +RES_int _name 8*(_size) _align +.endm + +# macro RES_DQ name, size [, align] +.macro RES_DQ _name, _size, _align=16 +RES_int _name 16*(_size) _align +.endm + +# macro RES_Y name, size [, align] +.macro RES_Y _name, _size, _align=32 +RES_int _name 32*(_size) _align +.endm + +# macro RES_Z name, size [, align] +.macro RES_Z _name, _size, _align=64 +RES_int _name 64*(_size) _align +.endm + +#endif + +################################################################### +### Define SHA512 Out Of Order Data Structures +################################################################### + +START_FIELDS # LANE_DATA +### name size align +FIELD _job_in_lane, 8, 8 # pointer to job object +END_FIELDS + + _LANE_DATA_size = _FIELD_OFFSET + _LANE_DATA_align = _STRUCT_ALIGN + +#################################################################### + +START_FIELDS # SHA512_ARGS_X4 +### name size align +FIELD _digest, 8*8*4, 4 # transposed digest +FIELD _data_ptr, 8*4, 8 # array of pointers to data +END_FIELDS + + _SHA512_ARGS_X4_size = _FIELD_OFFSET + _SHA512_ARGS_X4_align = _STRUCT_ALIGN + +##################################################################### + +START_FIELDS # MB_MGR +### name size align +FIELD _args, _SHA512_ARGS_X4_size, _SHA512_ARGS_X4_align +FIELD _lens, 8*4, 8 +FIELD _unused_lanes, 8, 8 +FIELD _ldata, _LANE_DATA_size*4, _LANE_DATA_align +END_FIELDS + + _MB_MGR_size = _FIELD_OFFSET + _MB_MGR_align = _STRUCT_ALIGN + +_args_digest = _args + _digest +_args_data_ptr = _args + _data_ptr + +####################################################################### + +####################################################################### +#### Define constants +####################################################################### + +#define STS_UNKNOWN 0 +#define STS_BEING_PROCESSED 1 +#define STS_COMPLETED 2 + +####################################################################### +#### Define JOB_SHA512 structure +####################################################################### + +START_FIELDS # JOB_SHA512 +### name size align +FIELD _buffer, 8, 8 # pointer to buffer +FIELD _len, 8, 8 # length in bytes +FIELD _result_digest, 8*8, 32 # Digest (output) +FIELD _status, 4, 4 +FIELD _user_data, 8, 8 +END_FIELDS + + _JOB_SHA512_size = _FIELD_OFFSET + _JOB_SHA512_align = _STRUCT_ALIGN diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S new file mode 100644 index 000000000000..3ddba19a0db6 --- /dev/null +++ b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S @@ -0,0 +1,291 @@ +/* + * Flush routine for SHA512 multibuffer + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2016 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Megha Dey <megha.dey@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2016 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <linux/linkage.h> +#include <asm/frame.h> +#include "sha512_mb_mgr_datastruct.S" + +.extern sha512_x4_avx2 + +# LINUX register definitions +#define arg1 %rdi +#define arg2 %rsi + +# idx needs to be other than arg1, arg2, rbx, r12 +#define idx %rdx + +# Common definitions +#define state arg1 +#define job arg2 +#define len2 arg2 + +#define unused_lanes %rbx +#define lane_data %rbx +#define tmp2 %rbx + +#define job_rax %rax +#define tmp1 %rax +#define size_offset %rax +#define tmp %rax +#define start_offset %rax + +#define tmp3 arg1 + +#define extra_blocks arg2 +#define p arg2 + +#define tmp4 %r8 +#define lens0 %r8 + +#define lens1 %r9 +#define lens2 %r10 +#define lens3 %r11 + +.macro LABEL prefix n +\prefix\n\(): +.endm + +.macro JNE_SKIP i +jne skip_\i +.endm + +.altmacro +.macro SET_OFFSET _offset +offset = \_offset +.endm +.noaltmacro + +# JOB* sha512_mb_mgr_flush_avx2(MB_MGR *state) +# arg 1 : rcx : state +ENTRY(sha512_mb_mgr_flush_avx2) + FRAME_BEGIN + push %rbx + + # If bit (32+3) is set, then all lanes are empty + mov _unused_lanes(state), unused_lanes + bt $32+7, unused_lanes + jc return_null + + # find a lane with a non-null job + xor idx, idx + offset = (_ldata + 1*_LANE_DATA_size + _job_in_lane) + cmpq $0, offset(state) + cmovne one(%rip), idx + offset = (_ldata + 2*_LANE_DATA_size + _job_in_lane) + cmpq $0, offset(state) + cmovne two(%rip), idx + offset = (_ldata + 3*_LANE_DATA_size + _job_in_lane) + cmpq $0, offset(state) + cmovne three(%rip), idx + + # copy idx to empty lanes +copy_lane_data: + offset = (_args + _data_ptr) + mov offset(state,idx,8), tmp + + I = 0 +.rep 4 + offset = (_ldata + I * _LANE_DATA_size + _job_in_lane) + cmpq $0, offset(state) +.altmacro + JNE_SKIP %I + offset = (_args + _data_ptr + 8*I) + mov tmp, offset(state) + offset = (_lens + 8*I +4) + movl $0xFFFFFFFF, offset(state) +LABEL skip_ %I + I = (I+1) +.noaltmacro +.endr + + # Find min length + mov _lens + 0*8(state),lens0 + mov lens0,idx + mov _lens + 1*8(state),lens1 + cmp idx,lens1 + cmovb lens1,idx + mov _lens + 2*8(state),lens2 + cmp idx,lens2 + cmovb lens2,idx + mov _lens + 3*8(state),lens3 + cmp idx,lens3 + cmovb lens3,idx + mov idx,len2 + and $0xF,idx + and $~0xFF,len2 + jz len_is_0 + + sub len2, lens0 + sub len2, lens1 + sub len2, lens2 + sub len2, lens3 + shr $32,len2 + mov lens0, _lens + 0*8(state) + mov lens1, _lens + 1*8(state) + mov lens2, _lens + 2*8(state) + mov lens3, _lens + 3*8(state) + + # "state" and "args" are the same address, arg1 + # len is arg2 + call sha512_x4_avx2 + # state and idx are intact + +len_is_0: + # process completed job "idx" + imul $_LANE_DATA_size, idx, lane_data + lea _ldata(state, lane_data), lane_data + + mov _job_in_lane(lane_data), job_rax + movq $0, _job_in_lane(lane_data) + movl $STS_COMPLETED, _status(job_rax) + mov _unused_lanes(state), unused_lanes + shl $8, unused_lanes + or idx, unused_lanes + mov unused_lanes, _unused_lanes(state) + + movl $0xFFFFFFFF, _lens+4(state, idx, 8) + + vmovq _args_digest+0*32(state, idx, 8), %xmm0 + vpinsrq $1, _args_digest+1*32(state, idx, 8), %xmm0, %xmm0 + vmovq _args_digest+2*32(state, idx, 8), %xmm1 + vpinsrq $1, _args_digest+3*32(state, idx, 8), %xmm1, %xmm1 + vmovq _args_digest+4*32(state, idx, 8), %xmm2 + vpinsrq $1, _args_digest+5*32(state, idx, 8), %xmm2, %xmm2 + vmovq _args_digest+6*32(state, idx, 8), %xmm3 + vpinsrq $1, _args_digest+7*32(state, idx, 8), %xmm3, %xmm3 + + vmovdqu %xmm0, _result_digest(job_rax) + vmovdqu %xmm1, _result_digest+1*16(job_rax) + vmovdqu %xmm2, _result_digest+2*16(job_rax) + vmovdqu %xmm3, _result_digest+3*16(job_rax) + +return: + pop %rbx + FRAME_END + ret + +return_null: + xor job_rax, job_rax + jmp return +ENDPROC(sha512_mb_mgr_flush_avx2) +.align 16 + +ENTRY(sha512_mb_mgr_get_comp_job_avx2) + push %rbx + + mov _unused_lanes(state), unused_lanes + bt $(32+7), unused_lanes + jc .return_null + + # Find min length + mov _lens(state),lens0 + mov lens0,idx + mov _lens+1*8(state),lens1 + cmp idx,lens1 + cmovb lens1,idx + mov _lens+2*8(state),lens2 + cmp idx,lens2 + cmovb lens2,idx + mov _lens+3*8(state),lens3 + cmp idx,lens3 + cmovb lens3,idx + test $~0xF,idx + jnz .return_null + and $0xF,idx + + #process completed job "idx" + imul $_LANE_DATA_size, idx, lane_data + lea _ldata(state, lane_data), lane_data + + mov _job_in_lane(lane_data), job_rax + movq $0, _job_in_lane(lane_data) + movl $STS_COMPLETED, _status(job_rax) + mov _unused_lanes(state), unused_lanes + shl $8, unused_lanes + or idx, unused_lanes + mov unused_lanes, _unused_lanes(state) + + movl $0xFFFFFFFF, _lens+4(state, idx, 8) + + vmovq _args_digest(state, idx, 8), %xmm0 + vpinsrq $1, _args_digest+1*32(state, idx, 8), %xmm0, %xmm0 + vmovq _args_digest+2*32(state, idx, 8), %xmm1 + vpinsrq $1, _args_digest+3*32(state, idx, 8), %xmm1, %xmm1 + vmovq _args_digest+4*32(state, idx, 8), %xmm2 + vpinsrq $1, _args_digest+5*32(state, idx, 8), %xmm2, %xmm2 + vmovq _args_digest+6*32(state, idx, 8), %xmm3 + vpinsrq $1, _args_digest+7*32(state, idx, 8), %xmm3, %xmm3 + + vmovdqu %xmm0, _result_digest+0*16(job_rax) + vmovdqu %xmm1, _result_digest+1*16(job_rax) + vmovdqu %xmm2, _result_digest+2*16(job_rax) + vmovdqu %xmm3, _result_digest+3*16(job_rax) + + pop %rbx + + ret + +.return_null: + xor job_rax, job_rax + pop %rbx + ret +ENDPROC(sha512_mb_mgr_get_comp_job_avx2) +.data + +.align 16 +one: +.quad 1 +two: +.quad 2 +three: +.quad 3 diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_init_avx2.c b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_init_avx2.c new file mode 100644 index 000000000000..36870b26067a --- /dev/null +++ b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_init_avx2.c @@ -0,0 +1,67 @@ +/* + * Initialization code for multi buffer SHA256 algorithm for AVX2 + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2016 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Megha Dey <megha.dey@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2016 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "sha512_mb_mgr.h" + +void sha512_mb_mgr_init_avx2(struct sha512_mb_mgr *state) +{ + unsigned int j; + + state->lens[0] = 0; + state->lens[1] = 1; + state->lens[2] = 2; + state->lens[3] = 3; + state->unused_lanes = 0xFF03020100; + for (j = 0; j < 4; j++) + state->ldata[j].job_in_lane = NULL; +} diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_submit_avx2.S new file mode 100644 index 000000000000..815f07bdd1f8 --- /dev/null +++ b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_submit_avx2.S @@ -0,0 +1,222 @@ +/* + * Buffer submit code for multi buffer SHA512 algorithm + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2016 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Megha Dey <megha.dey@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2016 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <linux/linkage.h> +#include <asm/frame.h> +#include "sha512_mb_mgr_datastruct.S" + +.extern sha512_x4_avx2 + +#define arg1 %rdi +#define arg2 %rsi + +#define idx %rdx +#define last_len %rdx + +#define size_offset %rcx +#define tmp2 %rcx + +# Common definitions +#define state arg1 +#define job arg2 +#define len2 arg2 +#define p2 arg2 + +#define p %r11 +#define start_offset %r11 + +#define unused_lanes %rbx + +#define job_rax %rax +#define len %rax + +#define lane %r12 +#define tmp3 %r12 +#define lens3 %r12 + +#define extra_blocks %r8 +#define lens0 %r8 + +#define tmp %r9 +#define lens1 %r9 + +#define lane_data %r10 +#define lens2 %r10 + +#define DWORD_len %eax + +# JOB* sha512_mb_mgr_submit_avx2(MB_MGR *state, JOB *job) +# arg 1 : rcx : state +# arg 2 : rdx : job +ENTRY(sha512_mb_mgr_submit_avx2) + FRAME_BEGIN + push %rbx + push %r12 + + mov _unused_lanes(state), unused_lanes + movzb %bl,lane + shr $8, unused_lanes + imul $_LANE_DATA_size, lane,lane_data + movl $STS_BEING_PROCESSED, _status(job) + lea _ldata(state, lane_data), lane_data + mov unused_lanes, _unused_lanes(state) + movl _len(job), DWORD_len + + mov job, _job_in_lane(lane_data) + movl DWORD_len,_lens+4(state , lane, 8) + + # Load digest words from result_digest + vmovdqu _result_digest+0*16(job), %xmm0 + vmovdqu _result_digest+1*16(job), %xmm1 + vmovdqu _result_digest+2*16(job), %xmm2 + vmovdqu _result_digest+3*16(job), %xmm3 + + vmovq %xmm0, _args_digest(state, lane, 8) + vpextrq $1, %xmm0, _args_digest+1*32(state , lane, 8) + vmovq %xmm1, _args_digest+2*32(state , lane, 8) + vpextrq $1, %xmm1, _args_digest+3*32(state , lane, 8) + vmovq %xmm2, _args_digest+4*32(state , lane, 8) + vpextrq $1, %xmm2, _args_digest+5*32(state , lane, 8) + vmovq %xmm3, _args_digest+6*32(state , lane, 8) + vpextrq $1, %xmm3, _args_digest+7*32(state , lane, 8) + + mov _buffer(job), p + mov p, _args_data_ptr(state, lane, 8) + + cmp $0xFF, unused_lanes + jne return_null + +start_loop: + + # Find min length + mov _lens+0*8(state),lens0 + mov lens0,idx + mov _lens+1*8(state),lens1 + cmp idx,lens1 + cmovb lens1, idx + mov _lens+2*8(state),lens2 + cmp idx,lens2 + cmovb lens2,idx + mov _lens+3*8(state),lens3 + cmp idx,lens3 + cmovb lens3,idx + mov idx,len2 + and $0xF,idx + and $~0xFF,len2 + jz len_is_0 + + sub len2,lens0 + sub len2,lens1 + sub len2,lens2 + sub len2,lens3 + shr $32,len2 + mov lens0, _lens + 0*8(state) + mov lens1, _lens + 1*8(state) + mov lens2, _lens + 2*8(state) + mov lens3, _lens + 3*8(state) + + # "state" and "args" are the same address, arg1 + # len is arg2 + call sha512_x4_avx2 + # state and idx are intact + +len_is_0: + + # process completed job "idx" + imul $_LANE_DATA_size, idx, lane_data + lea _ldata(state, lane_data), lane_data + + mov _job_in_lane(lane_data), job_rax + mov _unused_lanes(state), unused_lanes + movq $0, _job_in_lane(lane_data) + movl $STS_COMPLETED, _status(job_rax) + shl $8, unused_lanes + or idx, unused_lanes + mov unused_lanes, _unused_lanes(state) + + movl $0xFFFFFFFF,_lens+4(state,idx,8) + vmovq _args_digest+0*32(state , idx, 8), %xmm0 + vpinsrq $1, _args_digest+1*32(state , idx, 8), %xmm0, %xmm0 + vmovq _args_digest+2*32(state , idx, 8), %xmm1 + vpinsrq $1, _args_digest+3*32(state , idx, 8), %xmm1, %xmm1 + vmovq _args_digest+4*32(state , idx, 8), %xmm2 + vpinsrq $1, _args_digest+5*32(state , idx, 8), %xmm2, %xmm2 + vmovq _args_digest+6*32(state , idx, 8), %xmm3 + vpinsrq $1, _args_digest+7*32(state , idx, 8), %xmm3, %xmm3 + + vmovdqu %xmm0, _result_digest + 0*16(job_rax) + vmovdqu %xmm1, _result_digest + 1*16(job_rax) + vmovdqu %xmm2, _result_digest + 2*16(job_rax) + vmovdqu %xmm3, _result_digest + 3*16(job_rax) + +return: + pop %r12 + pop %rbx + FRAME_END + ret + +return_null: + xor job_rax, job_rax + jmp return +ENDPROC(sha512_mb_mgr_submit_avx2) +.data + +.align 16 +H0: .int 0x6a09e667 +H1: .int 0xbb67ae85 +H2: .int 0x3c6ef372 +H3: .int 0xa54ff53a +H4: .int 0x510e527f +H5: .int 0x9b05688c +H6: .int 0x1f83d9ab +H7: .int 0x5be0cd19 diff --git a/arch/x86/crypto/sha512-mb/sha512_x4_avx2.S b/arch/x86/crypto/sha512-mb/sha512_x4_avx2.S new file mode 100644 index 000000000000..31ab1eff6413 --- /dev/null +++ b/arch/x86/crypto/sha512-mb/sha512_x4_avx2.S @@ -0,0 +1,529 @@ +/* + * Multi-buffer SHA512 algorithm hash compute routine + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2016 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Megha Dey <megha.dey@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2016 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +# code to compute quad SHA512 using AVX2 +# use YMMs to tackle the larger digest size +# outer calling routine takes care of save and restore of XMM registers +# Logic designed/laid out by JDG + +# Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15 +# Stack must be aligned to 32 bytes before call +# Linux clobbers: rax rbx rcx rsi r8 r9 r10 r11 r12 +# Linux preserves: rcx rdx rdi rbp r13 r14 r15 +# clobbers ymm0-15 + +#include <linux/linkage.h> +#include "sha512_mb_mgr_datastruct.S" + +arg1 = %rdi +arg2 = %rsi + +# Common definitions +STATE = arg1 +INP_SIZE = arg2 + +IDX = %rax +ROUND = %rbx +TBL = %r8 + +inp0 = %r9 +inp1 = %r10 +inp2 = %r11 +inp3 = %r12 + +a = %ymm0 +b = %ymm1 +c = %ymm2 +d = %ymm3 +e = %ymm4 +f = %ymm5 +g = %ymm6 +h = %ymm7 + +a0 = %ymm8 +a1 = %ymm9 +a2 = %ymm10 + +TT0 = %ymm14 +TT1 = %ymm13 +TT2 = %ymm12 +TT3 = %ymm11 +TT4 = %ymm10 +TT5 = %ymm9 + +T1 = %ymm14 +TMP = %ymm15 + +# Define stack usage +STACK_SPACE1 = SZ4*16 + NUM_SHA512_DIGEST_WORDS*SZ4 + 24 + +#define VMOVPD vmovupd +_digest = SZ4*16 + +# transpose r0, r1, r2, r3, t0, t1 +# "transpose" data in {r0..r3} using temps {t0..t3} +# Input looks like: {r0 r1 r2 r3} +# r0 = {a7 a6 a5 a4 a3 a2 a1 a0} +# r1 = {b7 b6 b5 b4 b3 b2 b1 b0} +# r2 = {c7 c6 c5 c4 c3 c2 c1 c0} +# r3 = {d7 d6 d5 d4 d3 d2 d1 d0} +# +# output looks like: {t0 r1 r0 r3} +# t0 = {d1 d0 c1 c0 b1 b0 a1 a0} +# r1 = {d3 d2 c3 c2 b3 b2 a3 a2} +# r0 = {d5 d4 c5 c4 b5 b4 a5 a4} +# r3 = {d7 d6 c7 c6 b7 b6 a7 a6} + +.macro TRANSPOSE r0 r1 r2 r3 t0 t1 + vshufps $0x44, \r1, \r0, \t0 # t0 = {b5 b4 a5 a4 b1 b0 a1 a0} + vshufps $0xEE, \r1, \r0, \r0 # r0 = {b7 b6 a7 a6 b3 b2 a3 a2} + vshufps $0x44, \r3, \r2, \t1 # t1 = {d5 d4 c5 c4 d1 d0 c1 c0} + vshufps $0xEE, \r3, \r2, \r2 # r2 = {d7 d6 c7 c6 d3 d2 c3 c2} + + vperm2f128 $0x20, \r2, \r0, \r1 # h6...a6 + vperm2f128 $0x31, \r2, \r0, \r3 # h2...a2 + vperm2f128 $0x31, \t1, \t0, \r0 # h5...a5 + vperm2f128 $0x20, \t1, \t0, \t0 # h1...a1 +.endm + +.macro ROTATE_ARGS +TMP_ = h +h = g +g = f +f = e +e = d +d = c +c = b +b = a +a = TMP_ +.endm + +# PRORQ reg, imm, tmp +# packed-rotate-right-double +# does a rotate by doing two shifts and an or +.macro _PRORQ reg imm tmp + vpsllq $(64-\imm),\reg,\tmp + vpsrlq $\imm,\reg, \reg + vpor \tmp,\reg, \reg +.endm + +# non-destructive +# PRORQ_nd reg, imm, tmp, src +.macro _PRORQ_nd reg imm tmp src + vpsllq $(64-\imm), \src, \tmp + vpsrlq $\imm, \src, \reg + vpor \tmp, \reg, \reg +.endm + +# PRORQ dst/src, amt +.macro PRORQ reg imm + _PRORQ \reg, \imm, TMP +.endm + +# PRORQ_nd dst, src, amt +.macro PRORQ_nd reg tmp imm + _PRORQ_nd \reg, \imm, TMP, \tmp +.endm + +#; arguments passed implicitly in preprocessor symbols i, a...h +.macro ROUND_00_15 _T1 i + PRORQ_nd a0, e, (18-14) # sig1: a0 = (e >> 4) + + vpxor g, f, a2 # ch: a2 = f^g + vpand e,a2, a2 # ch: a2 = (f^g)&e + vpxor g, a2, a2 # a2 = ch + + PRORQ_nd a1,e,41 # sig1: a1 = (e >> 25) + + offset = SZ4*(\i & 0xf) + vmovdqu \_T1,offset(%rsp) + vpaddq (TBL,ROUND,1), \_T1, \_T1 # T1 = W + K + vpxor e,a0, a0 # sig1: a0 = e ^ (e >> 5) + PRORQ a0, 14 # sig1: a0 = (e >> 6) ^ (e >> 11) + vpaddq a2, h, h # h = h + ch + PRORQ_nd a2,a,6 # sig0: a2 = (a >> 11) + vpaddq \_T1,h, h # h = h + ch + W + K + vpxor a1, a0, a0 # a0 = sigma1 + vmovdqu a,\_T1 + PRORQ_nd a1,a,39 # sig0: a1 = (a >> 22) + vpxor c, \_T1, \_T1 # maj: T1 = a^c + add $SZ4, ROUND # ROUND++ + vpand b, \_T1, \_T1 # maj: T1 = (a^c)&b + vpaddq a0, h, h + vpaddq h, d, d + vpxor a, a2, a2 # sig0: a2 = a ^ (a >> 11) + PRORQ a2,28 # sig0: a2 = (a >> 2) ^ (a >> 13) + vpxor a1, a2, a2 # a2 = sig0 + vpand c, a, a1 # maj: a1 = a&c + vpor \_T1, a1, a1 # a1 = maj + vpaddq a1, h, h # h = h + ch + W + K + maj + vpaddq a2, h, h # h = h + ch + W + K + maj + sigma0 + ROTATE_ARGS +.endm + + +#; arguments passed implicitly in preprocessor symbols i, a...h +.macro ROUND_16_XX _T1 i + vmovdqu SZ4*((\i-15)&0xf)(%rsp), \_T1 + vmovdqu SZ4*((\i-2)&0xf)(%rsp), a1 + vmovdqu \_T1, a0 + PRORQ \_T1,7 + vmovdqu a1, a2 + PRORQ a1,42 + vpxor a0, \_T1, \_T1 + PRORQ \_T1, 1 + vpxor a2, a1, a1 + PRORQ a1, 19 + vpsrlq $7, a0, a0 + vpxor a0, \_T1, \_T1 + vpsrlq $6, a2, a2 + vpxor a2, a1, a1 + vpaddq SZ4*((\i-16)&0xf)(%rsp), \_T1, \_T1 + vpaddq SZ4*((\i-7)&0xf)(%rsp), a1, a1 + vpaddq a1, \_T1, \_T1 + + ROUND_00_15 \_T1,\i +.endm + + +# void sha512_x4_avx2(void *STATE, const int INP_SIZE) +# arg 1 : STATE : pointer to input data +# arg 2 : INP_SIZE : size of data in blocks (assumed >= 1) +ENTRY(sha512_x4_avx2) + # general registers preserved in outer calling routine + # outer calling routine saves all the XMM registers + # save callee-saved clobbered registers to comply with C function ABI + push %r12 + push %r13 + push %r14 + push %r15 + + sub $STACK_SPACE1, %rsp + + # Load the pre-transposed incoming digest. + vmovdqu 0*SHA512_DIGEST_ROW_SIZE(STATE),a + vmovdqu 1*SHA512_DIGEST_ROW_SIZE(STATE),b + vmovdqu 2*SHA512_DIGEST_ROW_SIZE(STATE),c + vmovdqu 3*SHA512_DIGEST_ROW_SIZE(STATE),d + vmovdqu 4*SHA512_DIGEST_ROW_SIZE(STATE),e + vmovdqu 5*SHA512_DIGEST_ROW_SIZE(STATE),f + vmovdqu 6*SHA512_DIGEST_ROW_SIZE(STATE),g + vmovdqu 7*SHA512_DIGEST_ROW_SIZE(STATE),h + + lea K512_4(%rip),TBL + + # load the address of each of the 4 message lanes + # getting ready to transpose input onto stack + mov _data_ptr+0*PTR_SZ(STATE),inp0 + mov _data_ptr+1*PTR_SZ(STATE),inp1 + mov _data_ptr+2*PTR_SZ(STATE),inp2 + mov _data_ptr+3*PTR_SZ(STATE),inp3 + + xor IDX, IDX +lloop: + xor ROUND, ROUND + + # save old digest + vmovdqu a, _digest(%rsp) + vmovdqu b, _digest+1*SZ4(%rsp) + vmovdqu c, _digest+2*SZ4(%rsp) + vmovdqu d, _digest+3*SZ4(%rsp) + vmovdqu e, _digest+4*SZ4(%rsp) + vmovdqu f, _digest+5*SZ4(%rsp) + vmovdqu g, _digest+6*SZ4(%rsp) + vmovdqu h, _digest+7*SZ4(%rsp) + i = 0 +.rep 4 + vmovdqu PSHUFFLE_BYTE_FLIP_MASK(%rip), TMP + VMOVPD i*32(inp0, IDX), TT2 + VMOVPD i*32(inp1, IDX), TT1 + VMOVPD i*32(inp2, IDX), TT4 + VMOVPD i*32(inp3, IDX), TT3 + TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5 + vpshufb TMP, TT0, TT0 + vpshufb TMP, TT1, TT1 + vpshufb TMP, TT2, TT2 + vpshufb TMP, TT3, TT3 + ROUND_00_15 TT0,(i*4+0) + ROUND_00_15 TT1,(i*4+1) + ROUND_00_15 TT2,(i*4+2) + ROUND_00_15 TT3,(i*4+3) + i = (i+1) +.endr + add $128, IDX + + i = (i*4) + + jmp Lrounds_16_xx +.align 16 +Lrounds_16_xx: +.rep 16 + ROUND_16_XX T1, i + i = (i+1) +.endr + cmp $0xa00,ROUND + jb Lrounds_16_xx + + # add old digest + vpaddq _digest(%rsp), a, a + vpaddq _digest+1*SZ4(%rsp), b, b + vpaddq _digest+2*SZ4(%rsp), c, c + vpaddq _digest+3*SZ4(%rsp), d, d + vpaddq _digest+4*SZ4(%rsp), e, e + vpaddq _digest+5*SZ4(%rsp), f, f + vpaddq _digest+6*SZ4(%rsp), g, g + vpaddq _digest+7*SZ4(%rsp), h, h + + sub $1, INP_SIZE # unit is blocks + jne lloop + + # write back to memory (state object) the transposed digest + vmovdqu a, 0*SHA512_DIGEST_ROW_SIZE(STATE) + vmovdqu b, 1*SHA512_DIGEST_ROW_SIZE(STATE) + vmovdqu c, 2*SHA512_DIGEST_ROW_SIZE(STATE) + vmovdqu d, 3*SHA512_DIGEST_ROW_SIZE(STATE) + vmovdqu e, 4*SHA512_DIGEST_ROW_SIZE(STATE) + vmovdqu f, 5*SHA512_DIGEST_ROW_SIZE(STATE) + vmovdqu g, 6*SHA512_DIGEST_ROW_SIZE(STATE) + vmovdqu h, 7*SHA512_DIGEST_ROW_SIZE(STATE) + + # update input data pointers + add IDX, inp0 + mov inp0, _data_ptr+0*PTR_SZ(STATE) + add IDX, inp1 + mov inp1, _data_ptr+1*PTR_SZ(STATE) + add IDX, inp2 + mov inp2, _data_ptr+2*PTR_SZ(STATE) + add IDX, inp3 + mov inp3, _data_ptr+3*PTR_SZ(STATE) + + #;;;;;;;;;;;;;;; + #; Postamble + add $STACK_SPACE1, %rsp + # restore callee-saved clobbered registers + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + + # outer calling routine restores XMM and other GP registers + ret +ENDPROC(sha512_x4_avx2) + +.data +.align 64 +K512_4: + .octa 0x428a2f98d728ae22428a2f98d728ae22,\ + 0x428a2f98d728ae22428a2f98d728ae22 + .octa 0x7137449123ef65cd7137449123ef65cd,\ + 0x7137449123ef65cd7137449123ef65cd + .octa 0xb5c0fbcfec4d3b2fb5c0fbcfec4d3b2f,\ + 0xb5c0fbcfec4d3b2fb5c0fbcfec4d3b2f + .octa 0xe9b5dba58189dbbce9b5dba58189dbbc,\ + 0xe9b5dba58189dbbce9b5dba58189dbbc + .octa 0x3956c25bf348b5383956c25bf348b538,\ + 0x3956c25bf348b5383956c25bf348b538 + .octa 0x59f111f1b605d01959f111f1b605d019,\ + 0x59f111f1b605d01959f111f1b605d019 + .octa 0x923f82a4af194f9b923f82a4af194f9b,\ + 0x923f82a4af194f9b923f82a4af194f9b + .octa 0xab1c5ed5da6d8118ab1c5ed5da6d8118,\ + 0xab1c5ed5da6d8118ab1c5ed5da6d8118 + .octa 0xd807aa98a3030242d807aa98a3030242,\ + 0xd807aa98a3030242d807aa98a3030242 + .octa 0x12835b0145706fbe12835b0145706fbe,\ + 0x12835b0145706fbe12835b0145706fbe + .octa 0x243185be4ee4b28c243185be4ee4b28c,\ + 0x243185be4ee4b28c243185be4ee4b28c + .octa 0x550c7dc3d5ffb4e2550c7dc3d5ffb4e2,\ + 0x550c7dc3d5ffb4e2550c7dc3d5ffb4e2 + .octa 0x72be5d74f27b896f72be5d74f27b896f,\ + 0x72be5d74f27b896f72be5d74f27b896f + .octa 0x80deb1fe3b1696b180deb1fe3b1696b1,\ + 0x80deb1fe3b1696b180deb1fe3b1696b1 + .octa 0x9bdc06a725c712359bdc06a725c71235,\ + 0x9bdc06a725c712359bdc06a725c71235 + .octa 0xc19bf174cf692694c19bf174cf692694,\ + 0xc19bf174cf692694c19bf174cf692694 + .octa 0xe49b69c19ef14ad2e49b69c19ef14ad2,\ + 0xe49b69c19ef14ad2e49b69c19ef14ad2 + .octa 0xefbe4786384f25e3efbe4786384f25e3,\ + 0xefbe4786384f25e3efbe4786384f25e3 + .octa 0x0fc19dc68b8cd5b50fc19dc68b8cd5b5,\ + 0x0fc19dc68b8cd5b50fc19dc68b8cd5b5 + .octa 0x240ca1cc77ac9c65240ca1cc77ac9c65,\ + 0x240ca1cc77ac9c65240ca1cc77ac9c65 + .octa 0x2de92c6f592b02752de92c6f592b0275,\ + 0x2de92c6f592b02752de92c6f592b0275 + .octa 0x4a7484aa6ea6e4834a7484aa6ea6e483,\ + 0x4a7484aa6ea6e4834a7484aa6ea6e483 + .octa 0x5cb0a9dcbd41fbd45cb0a9dcbd41fbd4,\ + 0x5cb0a9dcbd41fbd45cb0a9dcbd41fbd4 + .octa 0x76f988da831153b576f988da831153b5,\ + 0x76f988da831153b576f988da831153b5 + .octa 0x983e5152ee66dfab983e5152ee66dfab,\ + 0x983e5152ee66dfab983e5152ee66dfab + .octa 0xa831c66d2db43210a831c66d2db43210,\ + 0xa831c66d2db43210a831c66d2db43210 + .octa 0xb00327c898fb213fb00327c898fb213f,\ + 0xb00327c898fb213fb00327c898fb213f + .octa 0xbf597fc7beef0ee4bf597fc7beef0ee4,\ + 0xbf597fc7beef0ee4bf597fc7beef0ee4 + .octa 0xc6e00bf33da88fc2c6e00bf33da88fc2,\ + 0xc6e00bf33da88fc2c6e00bf33da88fc2 + .octa 0xd5a79147930aa725d5a79147930aa725,\ + 0xd5a79147930aa725d5a79147930aa725 + .octa 0x06ca6351e003826f06ca6351e003826f,\ + 0x06ca6351e003826f06ca6351e003826f + .octa 0x142929670a0e6e70142929670a0e6e70,\ + 0x142929670a0e6e70142929670a0e6e70 + .octa 0x27b70a8546d22ffc27b70a8546d22ffc,\ + 0x27b70a8546d22ffc27b70a8546d22ffc + .octa 0x2e1b21385c26c9262e1b21385c26c926,\ + 0x2e1b21385c26c9262e1b21385c26c926 + .octa 0x4d2c6dfc5ac42aed4d2c6dfc5ac42aed,\ + 0x4d2c6dfc5ac42aed4d2c6dfc5ac42aed + .octa 0x53380d139d95b3df53380d139d95b3df,\ + 0x53380d139d95b3df53380d139d95b3df + .octa 0x650a73548baf63de650a73548baf63de,\ + 0x650a73548baf63de650a73548baf63de + .octa 0x766a0abb3c77b2a8766a0abb3c77b2a8,\ + 0x766a0abb3c77b2a8766a0abb3c77b2a8 + .octa 0x81c2c92e47edaee681c2c92e47edaee6,\ + 0x81c2c92e47edaee681c2c92e47edaee6 + .octa 0x92722c851482353b92722c851482353b,\ + 0x92722c851482353b92722c851482353b + .octa 0xa2bfe8a14cf10364a2bfe8a14cf10364,\ + 0xa2bfe8a14cf10364a2bfe8a14cf10364 + .octa 0xa81a664bbc423001a81a664bbc423001,\ + 0xa81a664bbc423001a81a664bbc423001 + .octa 0xc24b8b70d0f89791c24b8b70d0f89791,\ + 0xc24b8b70d0f89791c24b8b70d0f89791 + .octa 0xc76c51a30654be30c76c51a30654be30,\ + 0xc76c51a30654be30c76c51a30654be30 + .octa 0xd192e819d6ef5218d192e819d6ef5218,\ + 0xd192e819d6ef5218d192e819d6ef5218 + .octa 0xd69906245565a910d69906245565a910,\ + 0xd69906245565a910d69906245565a910 + .octa 0xf40e35855771202af40e35855771202a,\ + 0xf40e35855771202af40e35855771202a + .octa 0x106aa07032bbd1b8106aa07032bbd1b8,\ + 0x106aa07032bbd1b8106aa07032bbd1b8 + .octa 0x19a4c116b8d2d0c819a4c116b8d2d0c8,\ + 0x19a4c116b8d2d0c819a4c116b8d2d0c8 + .octa 0x1e376c085141ab531e376c085141ab53,\ + 0x1e376c085141ab531e376c085141ab53 + .octa 0x2748774cdf8eeb992748774cdf8eeb99,\ + 0x2748774cdf8eeb992748774cdf8eeb99 + .octa 0x34b0bcb5e19b48a834b0bcb5e19b48a8,\ + 0x34b0bcb5e19b48a834b0bcb5e19b48a8 + .octa 0x391c0cb3c5c95a63391c0cb3c5c95a63,\ + 0x391c0cb3c5c95a63391c0cb3c5c95a63 + .octa 0x4ed8aa4ae3418acb4ed8aa4ae3418acb,\ + 0x4ed8aa4ae3418acb4ed8aa4ae3418acb + .octa 0x5b9cca4f7763e3735b9cca4f7763e373,\ + 0x5b9cca4f7763e3735b9cca4f7763e373 + .octa 0x682e6ff3d6b2b8a3682e6ff3d6b2b8a3,\ + 0x682e6ff3d6b2b8a3682e6ff3d6b2b8a3 + .octa 0x748f82ee5defb2fc748f82ee5defb2fc,\ + 0x748f82ee5defb2fc748f82ee5defb2fc + .octa 0x78a5636f43172f6078a5636f43172f60,\ + 0x78a5636f43172f6078a5636f43172f60 + .octa 0x84c87814a1f0ab7284c87814a1f0ab72,\ + 0x84c87814a1f0ab7284c87814a1f0ab72 + .octa 0x8cc702081a6439ec8cc702081a6439ec,\ + 0x8cc702081a6439ec8cc702081a6439ec + .octa 0x90befffa23631e2890befffa23631e28,\ + 0x90befffa23631e2890befffa23631e28 + .octa 0xa4506cebde82bde9a4506cebde82bde9,\ + 0xa4506cebde82bde9a4506cebde82bde9 + .octa 0xbef9a3f7b2c67915bef9a3f7b2c67915,\ + 0xbef9a3f7b2c67915bef9a3f7b2c67915 + .octa 0xc67178f2e372532bc67178f2e372532b,\ + 0xc67178f2e372532bc67178f2e372532b + .octa 0xca273eceea26619cca273eceea26619c,\ + 0xca273eceea26619cca273eceea26619c + .octa 0xd186b8c721c0c207d186b8c721c0c207,\ + 0xd186b8c721c0c207d186b8c721c0c207 + .octa 0xeada7dd6cde0eb1eeada7dd6cde0eb1e,\ + 0xeada7dd6cde0eb1eeada7dd6cde0eb1e + .octa 0xf57d4f7fee6ed178f57d4f7fee6ed178,\ + 0xf57d4f7fee6ed178f57d4f7fee6ed178 + .octa 0x06f067aa72176fba06f067aa72176fba,\ + 0x06f067aa72176fba06f067aa72176fba + .octa 0x0a637dc5a2c898a60a637dc5a2c898a6,\ + 0x0a637dc5a2c898a60a637dc5a2c898a6 + .octa 0x113f9804bef90dae113f9804bef90dae,\ + 0x113f9804bef90dae113f9804bef90dae + .octa 0x1b710b35131c471b1b710b35131c471b,\ + 0x1b710b35131c471b1b710b35131c471b + .octa 0x28db77f523047d8428db77f523047d84,\ + 0x28db77f523047d8428db77f523047d84 + .octa 0x32caab7b40c7249332caab7b40c72493,\ + 0x32caab7b40c7249332caab7b40c72493 + .octa 0x3c9ebe0a15c9bebc3c9ebe0a15c9bebc,\ + 0x3c9ebe0a15c9bebc3c9ebe0a15c9bebc + .octa 0x431d67c49c100d4c431d67c49c100d4c,\ + 0x431d67c49c100d4c431d67c49c100d4c + .octa 0x4cc5d4becb3e42b64cc5d4becb3e42b6,\ + 0x4cc5d4becb3e42b64cc5d4becb3e42b6 + .octa 0x597f299cfc657e2a597f299cfc657e2a,\ + 0x597f299cfc657e2a597f299cfc657e2a + .octa 0x5fcb6fab3ad6faec5fcb6fab3ad6faec,\ + 0x5fcb6fab3ad6faec5fcb6fab3ad6faec + .octa 0x6c44198c4a4758176c44198c4a475817,\ + 0x6c44198c4a4758176c44198c4a475817 + +PSHUFFLE_BYTE_FLIP_MASK: .octa 0x08090a0b0c0d0e0f0001020304050607 + .octa 0x18191a1b1c1d1e1f1011121314151617 diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index 0b17c83d027d..2b0e2a6825f3 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -346,4 +346,10 @@ MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated"); MODULE_ALIAS_CRYPTO("sha512"); +MODULE_ALIAS_CRYPTO("sha512-ssse3"); +MODULE_ALIAS_CRYPTO("sha512-avx"); +MODULE_ALIAS_CRYPTO("sha512-avx2"); MODULE_ALIAS_CRYPTO("sha384"); +MODULE_ALIAS_CRYPTO("sha384-ssse3"); +MODULE_ALIAS_CRYPTO("sha384-avx"); +MODULE_ALIAS_CRYPTO("sha384-avx2"); |