From e50944e219f908968a6e01fbd0e8811a33bd5f04 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 30 Jun 2018 15:16:11 -0700 Subject: crypto: shash - remove useless setting of type flags Many shash algorithms set .cra_flags = CRYPTO_ALG_TYPE_SHASH. But this is redundant with the C structure type ('struct shash_alg'), and crypto_register_shash() already sets the type flag automatically, clearing any type flag that was already there. Apparently the useless assignment has just been copy+pasted around. So, remove the useless assignment from all the shash algorithms. This patch shouldn't change any actual behavior. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/arm64/crypto/aes-glue.c | 3 --- arch/arm64/crypto/ghash-ce-glue.c | 1 - arch/arm64/crypto/sha1-ce-glue.c | 1 - arch/arm64/crypto/sha2-ce-glue.c | 2 -- arch/arm64/crypto/sha256-glue.c | 4 ---- arch/arm64/crypto/sha3-ce-glue.c | 4 ---- arch/arm64/crypto/sha512-ce-glue.c | 2 -- arch/arm64/crypto/sha512-glue.c | 2 -- arch/arm64/crypto/sm3-ce-glue.c | 1 - 9 files changed, 20 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index 253188fb8cb0..a615a9a991a3 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -567,7 +567,6 @@ static struct shash_alg mac_algs[] = { { .base.cra_name = "cmac(aes)", .base.cra_driver_name = "cmac-aes-" MODE, .base.cra_priority = PRIO, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = AES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct mac_tfm_ctx) + 2 * AES_BLOCK_SIZE, @@ -583,7 +582,6 @@ static struct shash_alg mac_algs[] = { { .base.cra_name = "xcbc(aes)", .base.cra_driver_name = "xcbc-aes-" MODE, .base.cra_priority = PRIO, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = AES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct mac_tfm_ctx) + 2 * AES_BLOCK_SIZE, @@ -599,7 +597,6 @@ static struct shash_alg mac_algs[] = { { .base.cra_name = "cbcmac(aes)", .base.cra_driver_name = "cbcmac-aes-" MODE, .base.cra_priority = PRIO, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct mac_tfm_ctx), .base.cra_module = THIS_MODULE, diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c index 7cf0b1aa6ea8..757fae186753 100644 --- a/arch/arm64/crypto/ghash-ce-glue.c +++ b/arch/arm64/crypto/ghash-ce-glue.c @@ -204,7 +204,6 @@ static struct shash_alg ghash_alg = { .base.cra_name = "ghash", .base.cra_driver_name = "ghash-ce", .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = GHASH_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct ghash_key), .base.cra_module = THIS_MODULE, diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c index efbeb3e0dcfb..17fac2889f56 100644 --- a/arch/arm64/crypto/sha1-ce-glue.c +++ b/arch/arm64/crypto/sha1-ce-glue.c @@ -99,7 +99,6 @@ static struct shash_alg alg = { .cra_name = "sha1", .cra_driver_name = "sha1-ce", .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA1_BLOCK_SIZE, .cra_module = THIS_MODULE, } diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c index fd1ff2b13dfa..261f5195cab7 100644 --- a/arch/arm64/crypto/sha2-ce-glue.c +++ b/arch/arm64/crypto/sha2-ce-glue.c @@ -114,7 +114,6 @@ static struct shash_alg algs[] = { { .cra_name = "sha224", .cra_driver_name = "sha224-ce", .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA256_BLOCK_SIZE, 
.cra_module = THIS_MODULE, } @@ -129,7 +128,6 @@ static struct shash_alg algs[] = { { .cra_name = "sha256", .cra_driver_name = "sha256-ce", .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA256_BLOCK_SIZE, .cra_module = THIS_MODULE, } diff --git a/arch/arm64/crypto/sha256-glue.c b/arch/arm64/crypto/sha256-glue.c index e8880ccdc71f..f1b4f4420ca1 100644 --- a/arch/arm64/crypto/sha256-glue.c +++ b/arch/arm64/crypto/sha256-glue.c @@ -68,7 +68,6 @@ static struct shash_alg algs[] = { { .base.cra_name = "sha256", .base.cra_driver_name = "sha256-arm64", .base.cra_priority = 100, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA256_BLOCK_SIZE, .base.cra_module = THIS_MODULE, }, { @@ -81,7 +80,6 @@ static struct shash_alg algs[] = { { .base.cra_name = "sha224", .base.cra_driver_name = "sha224-arm64", .base.cra_priority = 100, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA224_BLOCK_SIZE, .base.cra_module = THIS_MODULE, } }; @@ -153,7 +151,6 @@ static struct shash_alg neon_algs[] = { { .base.cra_name = "sha256", .base.cra_driver_name = "sha256-arm64-neon", .base.cra_priority = 150, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA256_BLOCK_SIZE, .base.cra_module = THIS_MODULE, }, { @@ -166,7 +163,6 @@ static struct shash_alg neon_algs[] = { { .base.cra_name = "sha224", .base.cra_driver_name = "sha224-arm64-neon", .base.cra_priority = 150, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA224_BLOCK_SIZE, .base.cra_module = THIS_MODULE, } }; diff --git a/arch/arm64/crypto/sha3-ce-glue.c b/arch/arm64/crypto/sha3-ce-glue.c index da8222e528bd..a336feac0f59 100644 --- a/arch/arm64/crypto/sha3-ce-glue.c +++ b/arch/arm64/crypto/sha3-ce-glue.c @@ -105,7 +105,6 @@ static struct shash_alg algs[] = { { .descsize = sizeof(struct sha3_state), .base.cra_name = "sha3-224", .base.cra_driver_name = "sha3-224-ce", - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA3_224_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .base.cra_priority = 200, @@ -117,7 +116,6 @@ static struct shash_alg algs[] = { { .descsize = sizeof(struct sha3_state), .base.cra_name = "sha3-256", .base.cra_driver_name = "sha3-256-ce", - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA3_256_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .base.cra_priority = 200, @@ -129,7 +127,6 @@ static struct shash_alg algs[] = { { .descsize = sizeof(struct sha3_state), .base.cra_name = "sha3-384", .base.cra_driver_name = "sha3-384-ce", - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA3_384_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .base.cra_priority = 200, @@ -141,7 +138,6 @@ static struct shash_alg algs[] = { { .descsize = sizeof(struct sha3_state), .base.cra_name = "sha3-512", .base.cra_driver_name = "sha3-512-ce", - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA3_512_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .base.cra_priority = 200, diff --git a/arch/arm64/crypto/sha512-ce-glue.c b/arch/arm64/crypto/sha512-ce-glue.c index a77c8632a589..f2c5f28c622a 100644 --- a/arch/arm64/crypto/sha512-ce-glue.c +++ b/arch/arm64/crypto/sha512-ce-glue.c @@ -87,7 +87,6 @@ static struct shash_alg algs[] = { { .base.cra_name = "sha384", .base.cra_driver_name = "sha384-ce", .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA512_BLOCK_SIZE, .base.cra_module = THIS_MODULE, }, { @@ -100,7 +99,6 @@ static struct shash_alg algs[] = { { .base.cra_name = "sha512", 
.base.cra_driver_name = "sha512-ce", .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA512_BLOCK_SIZE, .base.cra_module = THIS_MODULE, } }; diff --git a/arch/arm64/crypto/sha512-glue.c b/arch/arm64/crypto/sha512-glue.c index 27db4851e380..325b23b43a9b 100644 --- a/arch/arm64/crypto/sha512-glue.c +++ b/arch/arm64/crypto/sha512-glue.c @@ -63,7 +63,6 @@ static struct shash_alg algs[] = { { .base.cra_name = "sha512", .base.cra_driver_name = "sha512-arm64", .base.cra_priority = 150, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA512_BLOCK_SIZE, .base.cra_module = THIS_MODULE, }, { @@ -76,7 +75,6 @@ static struct shash_alg algs[] = { { .base.cra_name = "sha384", .base.cra_driver_name = "sha384-arm64", .base.cra_priority = 150, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA384_BLOCK_SIZE, .base.cra_module = THIS_MODULE, } }; diff --git a/arch/arm64/crypto/sm3-ce-glue.c b/arch/arm64/crypto/sm3-ce-glue.c index 3b4948f7e26f..88938a20d9b2 100644 --- a/arch/arm64/crypto/sm3-ce-glue.c +++ b/arch/arm64/crypto/sm3-ce-glue.c @@ -72,7 +72,6 @@ static struct shash_alg sm3_alg = { .descsize = sizeof(struct sm3_state), .base.cra_name = "sm3", .base.cra_driver_name = "sm3-ce", - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SM3_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .base.cra_priority = 200, -- cgit v1.2.3 From 6b0daa78207b622d768cce2a8251c830256bade9 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 17 Jul 2018 10:09:26 -0700 Subject: crypto: arm64/sha256 - increase cra_priority of scalar implementations Commit b73b7ac0a774 ("crypto: sha256_generic - add cra_priority") gave sha256-generic and sha224-generic a cra_priority of 100, to match the convention for generic implementations. But sha256-arm64 and sha224-arm64 also have priority 100, so their order relative to the generic implementations became ambiguous. Therefore, increase their priority to 125 so that they have higher priority than the generic implementations but lower priority than the NEON implementations which have priority 150. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/arm64/crypto/sha256-glue.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/crypto/sha256-glue.c b/arch/arm64/crypto/sha256-glue.c index f1b4f4420ca1..4aedeaefd61f 100644 --- a/arch/arm64/crypto/sha256-glue.c +++ b/arch/arm64/crypto/sha256-glue.c @@ -67,7 +67,7 @@ static struct shash_alg algs[] = { { .descsize = sizeof(struct sha256_state), .base.cra_name = "sha256", .base.cra_driver_name = "sha256-arm64", - .base.cra_priority = 100, + .base.cra_priority = 125, .base.cra_blocksize = SHA256_BLOCK_SIZE, .base.cra_module = THIS_MODULE, }, { @@ -79,7 +79,7 @@ static struct shash_alg algs[] = { { .descsize = sizeof(struct sha256_state), .base.cra_name = "sha224", .base.cra_driver_name = "sha224-arm64", - .base.cra_priority = 100, + .base.cra_priority = 125, .base.cra_blocksize = SHA224_BLOCK_SIZE, .base.cra_module = THIS_MODULE, } }; -- cgit v1.2.3 From e4a1f7858ab834b2cdddfc51be2f5debae782029 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Mon, 23 Jul 2018 16:49:55 +0100 Subject: arm64: dts: hisi: add SEC crypto accelerator nodes for hip07 SoC Enable all 4 SEC units available on d05 boards. 
Signed-off-by: Jonathan Cameron Signed-off-by: Herbert Xu --- arch/arm64/boot/dts/hisilicon/hip07.dtsi | 284 +++++++++++++++++++++++++++++++ 1 file changed, 284 insertions(+) (limited to 'arch/arm64') diff --git a/arch/arm64/boot/dts/hisilicon/hip07.dtsi b/arch/arm64/boot/dts/hisilicon/hip07.dtsi index 9c10030a07f8..c33adefc3061 100644 --- a/arch/arm64/boot/dts/hisilicon/hip07.dtsi +++ b/arch/arm64/boot/dts/hisilicon/hip07.dtsi @@ -1049,7 +1049,74 @@ num-pins = <2>; }; }; + p0_mbigen_alg_a:interrupt-controller@d0080000 { + compatible = "hisilicon,mbigen-v2"; + reg = <0x0 0xd0080000 0x0 0x10000>; + p0_mbigen_sec_a: intc_sec { + msi-parent = <&p0_its_dsa_a 0x40400>; + interrupt-controller; + #interrupt-cells = <2>; + num-pins = <33>; + }; + p0_mbigen_smmu_alg_a: intc_smmu_alg { + msi-parent = <&p0_its_dsa_a 0x40b1b>; + interrupt-controller; + #interrupt-cells = <2>; + num-pins = <3>; + }; + }; + p0_mbigen_alg_b:interrupt-controller@8,d0080000 { + compatible = "hisilicon,mbigen-v2"; + reg = <0x8 0xd0080000 0x0 0x10000>; + + p0_mbigen_sec_b: intc_sec { + msi-parent = <&p0_its_dsa_b 0x42400>; + interrupt-controller; + #interrupt-cells = <2>; + num-pins = <33>; + }; + p0_mbigen_smmu_alg_b: intc_smmu_alg { + msi-parent = <&p0_its_dsa_b 0x42b1b>; + interrupt-controller; + #interrupt-cells = <2>; + num-pins = <3>; + }; + }; + p1_mbigen_alg_a:interrupt-controller@400,d0080000 { + compatible = "hisilicon,mbigen-v2"; + reg = <0x400 0xd0080000 0x0 0x10000>; + + p1_mbigen_sec_a: intc_sec { + msi-parent = <&p1_its_dsa_a 0x44400>; + interrupt-controller; + #interrupt-cells = <2>; + num-pins = <33>; + }; + p1_mbigen_smmu_alg_a: intc_smmu_alg { + msi-parent = <&p1_its_dsa_a 0x44b1b>; + interrupt-controller; + #interrupt-cells = <2>; + num-pins = <3>; + }; + }; + p1_mbigen_alg_b:interrupt-controller@408,d0080000 { + compatible = "hisilicon,mbigen-v2"; + reg = <0x408 0xd0080000 0x0 0x10000>; + + p1_mbigen_sec_b: intc_sec { + msi-parent = <&p1_its_dsa_b 0x46400>; + interrupt-controller; + #interrupt-cells = <2>; + num-pins = <33>; + }; + p1_mbigen_smmu_alg_b: intc_smmu_alg { + msi-parent = <&p1_its_dsa_b 0x46b1b>; + interrupt-controller; + #interrupt-cells = <2>; + num-pins = <3>; + }; + }; p0_mbigen_dsa_a: interrupt-controller@c0080000 { compatible = "hisilicon,mbigen-v2"; reg = <0x0 0xc0080000 0x0 0x10000>; @@ -1107,6 +1174,58 @@ hisilicon,broken-prefetch-cmd; status = "disabled"; }; + p0_smmu_alg_a: smmu_alg@d0040000 { + compatible = "arm,smmu-v3"; + reg = <0x0 0xd0040000 0x0 0x20000>; + interrupt-parent = <&p0_mbigen_smmu_alg_a>; + interrupts = <733 1>, + <734 1>, + <735 1>; + interrupt-names = "eventq", "gerror", "priq"; + #iommu-cells = <1>; + dma-coherent; + hisilicon,broken-prefetch-cmd; + /* smmu-cb-memtype = <0x0 0x1>;*/ + }; + p0_smmu_alg_b: smmu_alg@8,d0040000 { + compatible = "arm,smmu-v3"; + reg = <0x8 0xd0040000 0x0 0x20000>; + interrupt-parent = <&p0_mbigen_smmu_alg_b>; + interrupts = <733 1>, + <734 1>, + <735 1>; + interrupt-names = "eventq", "gerror", "priq"; + #iommu-cells = <1>; + dma-coherent; + hisilicon,broken-prefetch-cmd; + /* smmu-cb-memtype = <0x0 0x1>;*/ + }; + p1_smmu_alg_a: smmu_alg@400,d0040000 { + compatible = "arm,smmu-v3"; + reg = <0x400 0xd0040000 0x0 0x20000>; + interrupt-parent = <&p1_mbigen_smmu_alg_a>; + interrupts = <733 1>, + <734 1>, + <735 1>; + interrupt-names = "eventq", "gerror", "priq"; + #iommu-cells = <1>; + dma-coherent; + hisilicon,broken-prefetch-cmd; + /* smmu-cb-memtype = <0x0 0x1>;*/ + }; + p1_smmu_alg_b: smmu_alg@408,d0040000 { + compatible = 
"arm,smmu-v3"; + reg = <0x408 0xd0040000 0x0 0x20000>; + interrupt-parent = <&p1_mbigen_smmu_alg_b>; + interrupts = <733 1>, + <734 1>, + <735 1>; + interrupt-names = "eventq", "gerror", "priq"; + #iommu-cells = <1>; + dma-coherent; + hisilicon,broken-prefetch-cmd; + /* smmu-cb-memtype = <0x0 0x1>;*/ + }; soc { compatible = "simple-bus"; @@ -1603,5 +1722,170 @@ 0x0 0 0 4 &mbigen_pcie2_a 671 4>; status = "disabled"; }; + p0_sec_a: crypto@d2000000 { + compatible = "hisilicon,hip07-sec"; + reg = <0x0 0xd0000000 0x0 0x10000 + 0x0 0xd2000000 0x0 0x10000 + 0x0 0xd2010000 0x0 0x10000 + 0x0 0xd2020000 0x0 0x10000 + 0x0 0xd2030000 0x0 0x10000 + 0x0 0xd2040000 0x0 0x10000 + 0x0 0xd2050000 0x0 0x10000 + 0x0 0xd2060000 0x0 0x10000 + 0x0 0xd2070000 0x0 0x10000 + 0x0 0xd2080000 0x0 0x10000 + 0x0 0xd2090000 0x0 0x10000 + 0x0 0xd20a0000 0x0 0x10000 + 0x0 0xd20b0000 0x0 0x10000 + 0x0 0xd20c0000 0x0 0x10000 + 0x0 0xd20d0000 0x0 0x10000 + 0x0 0xd20e0000 0x0 0x10000 + 0x0 0xd20f0000 0x0 0x10000 + 0x0 0xd2100000 0x0 0x10000>; + interrupt-parent = <&p0_mbigen_sec_a>; + iommus = <&p0_smmu_alg_a 0x600>; + dma-coherent; + interrupts = <576 4>, + <577 1>, <578 4>, + <579 1>, <580 4>, + <581 1>, <582 4>, + <583 1>, <584 4>, + <585 1>, <586 4>, + <587 1>, <588 4>, + <589 1>, <590 4>, + <591 1>, <592 4>, + <593 1>, <594 4>, + <595 1>, <596 4>, + <597 1>, <598 4>, + <599 1>, <600 4>, + <601 1>, <602 4>, + <603 1>, <604 4>, + <605 1>, <606 4>, + <607 1>, <608 4>; + }; + p0_sec_b: crypto@8,d2000000 { + compatible = "hisilicon,hip07-sec"; + reg = <0x8 0xd0000000 0x0 0x10000 + 0x8 0xd2000000 0x0 0x10000 + 0x8 0xd2010000 0x0 0x10000 + 0x8 0xd2020000 0x0 0x10000 + 0x8 0xd2030000 0x0 0x10000 + 0x8 0xd2040000 0x0 0x10000 + 0x8 0xd2050000 0x0 0x10000 + 0x8 0xd2060000 0x0 0x10000 + 0x8 0xd2070000 0x0 0x10000 + 0x8 0xd2080000 0x0 0x10000 + 0x8 0xd2090000 0x0 0x10000 + 0x8 0xd20a0000 0x0 0x10000 + 0x8 0xd20b0000 0x0 0x10000 + 0x8 0xd20c0000 0x0 0x10000 + 0x8 0xd20d0000 0x0 0x10000 + 0x8 0xd20e0000 0x0 0x10000 + 0x8 0xd20f0000 0x0 0x10000 + 0x8 0xd2100000 0x0 0x10000>; + interrupt-parent = <&p0_mbigen_sec_b>; + iommus = <&p0_smmu_alg_b 0x600>; + dma-coherent; + interrupts = <576 4>, + <577 1>, <578 4>, + <579 1>, <580 4>, + <581 1>, <582 4>, + <583 1>, <584 4>, + <585 1>, <586 4>, + <587 1>, <588 4>, + <589 1>, <590 4>, + <591 1>, <592 4>, + <593 1>, <594 4>, + <595 1>, <596 4>, + <597 1>, <598 4>, + <599 1>, <600 4>, + <601 1>, <602 4>, + <603 1>, <604 4>, + <605 1>, <606 4>, + <607 1>, <608 4>; + }; + p1_sec_a: crypto@400,d2000000 { + compatible = "hisilicon,hip07-sec"; + reg = <0x400 0xd0000000 0x0 0x10000 + 0x400 0xd2000000 0x0 0x10000 + 0x400 0xd2010000 0x0 0x10000 + 0x400 0xd2020000 0x0 0x10000 + 0x400 0xd2030000 0x0 0x10000 + 0x400 0xd2040000 0x0 0x10000 + 0x400 0xd2050000 0x0 0x10000 + 0x400 0xd2060000 0x0 0x10000 + 0x400 0xd2070000 0x0 0x10000 + 0x400 0xd2080000 0x0 0x10000 + 0x400 0xd2090000 0x0 0x10000 + 0x400 0xd20a0000 0x0 0x10000 + 0x400 0xd20b0000 0x0 0x10000 + 0x400 0xd20c0000 0x0 0x10000 + 0x400 0xd20d0000 0x0 0x10000 + 0x400 0xd20e0000 0x0 0x10000 + 0x400 0xd20f0000 0x0 0x10000 + 0x400 0xd2100000 0x0 0x10000>; + interrupt-parent = <&p1_mbigen_sec_a>; + iommus = <&p1_smmu_alg_a 0x600>; + dma-coherent; + interrupts = <576 4>, + <577 1>, <578 4>, + <579 1>, <580 4>, + <581 1>, <582 4>, + <583 1>, <584 4>, + <585 1>, <586 4>, + <587 1>, <588 4>, + <589 1>, <590 4>, + <591 1>, <592 4>, + <593 1>, <594 4>, + <595 1>, <596 4>, + <597 1>, <598 4>, + <599 1>, <600 4>, + <601 1>, <602 4>, + <603 1>, <604 4>, + <605 1>, <606 
4>, + <607 1>, <608 4>; + }; + p1_sec_b: crypto@408,d2000000 { + compatible = "hisilicon,hip07-sec"; + reg = <0x408 0xd0000000 0x0 0x10000 + 0x408 0xd2000000 0x0 0x10000 + 0x408 0xd2010000 0x0 0x10000 + 0x408 0xd2020000 0x0 0x10000 + 0x408 0xd2030000 0x0 0x10000 + 0x408 0xd2040000 0x0 0x10000 + 0x408 0xd2050000 0x0 0x10000 + 0x408 0xd2060000 0x0 0x10000 + 0x408 0xd2070000 0x0 0x10000 + 0x408 0xd2080000 0x0 0x10000 + 0x408 0xd2090000 0x0 0x10000 + 0x408 0xd20a0000 0x0 0x10000 + 0x408 0xd20b0000 0x0 0x10000 + 0x408 0xd20c0000 0x0 0x10000 + 0x408 0xd20d0000 0x0 0x10000 + 0x408 0xd20e0000 0x0 0x10000 + 0x408 0xd20f0000 0x0 0x10000 + 0x408 0xd2100000 0x0 0x10000>; + interrupt-parent = <&p1_mbigen_sec_b>; + iommus = <&p1_smmu_alg_b 0x600>; + dma-coherent; + interrupts = <576 4>, + <577 1>, <578 4>, + <579 1>, <580 4>, + <581 1>, <582 4>, + <583 1>, <584 4>, + <585 1>, <586 4>, + <587 1>, <588 4>, + <589 1>, <590 4>, + <591 1>, <592 4>, + <593 1>, <594 4>, + <595 1>, <596 4>, + <597 1>, <598 4>, + <599 1>, <600 4>, + <601 1>, <602 4>, + <603 1>, <604 4>, + <605 1>, <606 4>, + <607 1>, <608 4>; + }; + }; }; -- cgit v1.2.3 From 71e52c278c54db10e368c54687234390357b08d6 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 30 Jul 2018 23:06:40 +0200 Subject: crypto: arm64/aes-ce-gcm - operate on two input blocks at a time Update the core AES/GCM transform and the associated plumbing to operate on 2 AES/GHASH blocks at a time. By itself, this is not expected to result in a noticeable speedup, but it paves the way for reimplementing the GHASH component using 2-way aggregation. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/ghash-ce-core.S | 127 +++++++++++++++++++++++++++++--------- arch/arm64/crypto/ghash-ce-glue.c | 103 +++++++++++++++++++------------ 2 files changed, 161 insertions(+), 69 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S index c723647b37db..dac0df29d194 100644 --- a/arch/arm64/crypto/ghash-ce-core.S +++ b/arch/arm64/crypto/ghash-ce-core.S @@ -286,9 +286,10 @@ ENTRY(pmull_ghash_update_p8) __pmull_ghash p8 ENDPROC(pmull_ghash_update_p8) - KS .req v8 - CTR .req v9 - INP .req v10 + KS0 .req v8 + KS1 .req v9 + INP0 .req v10 + INP1 .req v11 .macro load_round_keys, rounds, rk cmp \rounds, #12 @@ -336,84 +337,146 @@ CPU_LE( rev x8, x8 ) .if \enc == 1 ldr x10, [sp] - ld1 {KS.16b}, [x10] + ld1 {KS0.16b-KS1.16b}, [x10] .endif -0: ld1 {CTR.8b}, [x5] // load upper counter - ld1 {INP.16b}, [x3], #16 +0: ld1 {INP0.16b-INP1.16b}, [x3], #32 + rev x9, x8 - add x8, x8, #1 - sub w0, w0, #1 - ins CTR.d[1], x9 // set lower counter + add x11, x8, #1 + add x8, x8, #2 .if \enc == 1 - eor INP.16b, INP.16b, KS.16b // encrypt input - st1 {INP.16b}, [x2], #16 + eor INP0.16b, INP0.16b, KS0.16b // encrypt input + eor INP1.16b, INP1.16b, KS1.16b .endif - rev64 T1.16b, INP.16b + ld1 {KS0.8b}, [x5] // load upper counter + rev x11, x11 + sub w0, w0, #2 + mov KS1.8b, KS0.8b + ins KS0.d[1], x9 // set lower counter + ins KS1.d[1], x11 + + rev64 T1.16b, INP0.16b cmp w7, #12 b.ge 2f // AES-192/256? 
-1: enc_round CTR, v21 +1: enc_round KS0, v21 ext T2.16b, XL.16b, XL.16b, #8 ext IN1.16b, T1.16b, T1.16b, #8 - enc_round CTR, v22 + enc_round KS1, v21 eor T1.16b, T1.16b, T2.16b eor XL.16b, XL.16b, IN1.16b - enc_round CTR, v23 + enc_round KS0, v22 pmull2 XH.1q, SHASH.2d, XL.2d // a1 * b1 eor T1.16b, T1.16b, XL.16b - enc_round CTR, v24 + enc_round KS1, v22 pmull XL.1q, SHASH.1d, XL.1d // a0 * b0 pmull XM.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0) - enc_round CTR, v25 + enc_round KS0, v23 ext T1.16b, XL.16b, XH.16b, #8 eor T2.16b, XL.16b, XH.16b eor XM.16b, XM.16b, T1.16b - enc_round CTR, v26 + enc_round KS1, v23 eor XM.16b, XM.16b, T2.16b pmull T2.1q, XL.1d, MASK.1d - enc_round CTR, v27 + enc_round KS0, v24 mov XH.d[0], XM.d[1] mov XM.d[1], XL.d[0] - enc_round CTR, v28 + enc_round KS1, v24 eor XL.16b, XM.16b, T2.16b - enc_round CTR, v29 + enc_round KS0, v25 ext T2.16b, XL.16b, XL.16b, #8 - aese CTR.16b, v30.16b + enc_round KS1, v25 pmull XL.1q, XL.1d, MASK.1d eor T2.16b, T2.16b, XH.16b - eor KS.16b, CTR.16b, v31.16b + enc_round KS0, v26 + + eor XL.16b, XL.16b, T2.16b + rev64 T1.16b, INP1.16b + + enc_round KS1, v26 + + ext T2.16b, XL.16b, XL.16b, #8 + ext IN1.16b, T1.16b, T1.16b, #8 + + enc_round KS0, v27 + + eor T1.16b, T1.16b, T2.16b + eor XL.16b, XL.16b, IN1.16b + + enc_round KS1, v27 + + pmull2 XH.1q, SHASH.2d, XL.2d // a1 * b1 + eor T1.16b, T1.16b, XL.16b + + enc_round KS0, v28 + + pmull XL.1q, SHASH.1d, XL.1d // a0 * b0 + pmull XM.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0) + + enc_round KS1, v28 + + ext T1.16b, XL.16b, XH.16b, #8 + eor T2.16b, XL.16b, XH.16b + eor XM.16b, XM.16b, T1.16b + + enc_round KS0, v29 + + eor XM.16b, XM.16b, T2.16b + pmull T2.1q, XL.1d, MASK.1d + + enc_round KS1, v29 + + mov XH.d[0], XM.d[1] + mov XM.d[1], XL.d[0] + + aese KS0.16b, v30.16b + + eor XL.16b, XM.16b, T2.16b + + aese KS1.16b, v30.16b + + ext T2.16b, XL.16b, XL.16b, #8 + + eor KS0.16b, KS0.16b, v31.16b + + pmull XL.1q, XL.1d, MASK.1d + eor T2.16b, T2.16b, XH.16b + + eor KS1.16b, KS1.16b, v31.16b eor XL.16b, XL.16b, T2.16b .if \enc == 0 - eor INP.16b, INP.16b, KS.16b - st1 {INP.16b}, [x2], #16 + eor INP0.16b, INP0.16b, KS0.16b + eor INP1.16b, INP1.16b, KS1.16b .endif + st1 {INP0.16b-INP1.16b}, [x2], #32 + cbnz w0, 0b CPU_LE( rev x8, x8 ) @@ -421,16 +484,20 @@ CPU_LE( rev x8, x8 ) str x8, [x5, #8] // store lower counter .if \enc == 1 - st1 {KS.16b}, [x10] + st1 {KS0.16b-KS1.16b}, [x10] .endif ret 2: b.eq 3f // AES-192? 
- enc_round CTR, v17 - enc_round CTR, v18 -3: enc_round CTR, v19 - enc_round CTR, v20 + enc_round KS0, v17 + enc_round KS1, v17 + enc_round KS0, v18 + enc_round KS1, v18 +3: enc_round KS0, v19 + enc_round KS1, v19 + enc_round KS0, v20 + enc_round KS1, v20 b 1b .endm diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c index 18b4d1b96a7e..8ff6732c4fb5 100644 --- a/arch/arm64/crypto/ghash-ce-glue.c +++ b/arch/arm64/crypto/ghash-ce-glue.c @@ -348,9 +348,10 @@ static int gcm_encrypt(struct aead_request *req) struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead); struct skcipher_walk walk; u8 iv[AES_BLOCK_SIZE]; - u8 ks[AES_BLOCK_SIZE]; + u8 ks[2 * AES_BLOCK_SIZE]; u8 tag[AES_BLOCK_SIZE]; u64 dg[2] = {}; + int nrounds = num_rounds(&ctx->aes_key); int err; if (req->assoclen) @@ -362,32 +363,31 @@ static int gcm_encrypt(struct aead_request *req) if (likely(may_use_simd())) { kernel_neon_begin(); - pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc, - num_rounds(&ctx->aes_key)); + pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc, nrounds); put_unaligned_be32(2, iv + GCM_IV_SIZE); - pmull_gcm_encrypt_block(ks, iv, NULL, - num_rounds(&ctx->aes_key)); + pmull_gcm_encrypt_block(ks, iv, NULL, nrounds); put_unaligned_be32(3, iv + GCM_IV_SIZE); + pmull_gcm_encrypt_block(ks + AES_BLOCK_SIZE, iv, NULL, nrounds); + put_unaligned_be32(4, iv + GCM_IV_SIZE); kernel_neon_end(); err = skcipher_walk_aead_encrypt(&walk, req, false); - while (walk.nbytes >= AES_BLOCK_SIZE) { - int blocks = walk.nbytes / AES_BLOCK_SIZE; + while (walk.nbytes >= 2 * AES_BLOCK_SIZE) { + int blocks = walk.nbytes / (2 * AES_BLOCK_SIZE) * 2; kernel_neon_begin(); pmull_gcm_encrypt(blocks, dg, walk.dst.virt.addr, walk.src.virt.addr, &ctx->ghash_key, - iv, ctx->aes_key.key_enc, - num_rounds(&ctx->aes_key), ks); + iv, ctx->aes_key.key_enc, nrounds, + ks); kernel_neon_end(); err = skcipher_walk_done(&walk, - walk.nbytes % AES_BLOCK_SIZE); + walk.nbytes % (2 * AES_BLOCK_SIZE)); } } else { - __aes_arm64_encrypt(ctx->aes_key.key_enc, tag, iv, - num_rounds(&ctx->aes_key)); + __aes_arm64_encrypt(ctx->aes_key.key_enc, tag, iv, nrounds); put_unaligned_be32(2, iv + GCM_IV_SIZE); err = skcipher_walk_aead_encrypt(&walk, req, false); @@ -399,8 +399,7 @@ static int gcm_encrypt(struct aead_request *req) do { __aes_arm64_encrypt(ctx->aes_key.key_enc, - ks, iv, - num_rounds(&ctx->aes_key)); + ks, iv, nrounds); crypto_xor_cpy(dst, src, ks, AES_BLOCK_SIZE); crypto_inc(iv, AES_BLOCK_SIZE); @@ -417,19 +416,28 @@ static int gcm_encrypt(struct aead_request *req) } if (walk.nbytes) __aes_arm64_encrypt(ctx->aes_key.key_enc, ks, iv, - num_rounds(&ctx->aes_key)); + nrounds); } /* handle the tail */ if (walk.nbytes) { u8 buf[GHASH_BLOCK_SIZE]; + unsigned int nbytes = walk.nbytes; + u8 *dst = walk.dst.virt.addr; + u8 *head = NULL; crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr, ks, walk.nbytes); - memcpy(buf, walk.dst.virt.addr, walk.nbytes); - memset(buf + walk.nbytes, 0, GHASH_BLOCK_SIZE - walk.nbytes); - ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL); + if (walk.nbytes > GHASH_BLOCK_SIZE) { + head = dst; + dst += GHASH_BLOCK_SIZE; + nbytes %= GHASH_BLOCK_SIZE; + } + + memcpy(buf, dst, nbytes); + memset(buf + nbytes, 0, GHASH_BLOCK_SIZE - nbytes); + ghash_do_update(!!nbytes, dg, buf, &ctx->ghash_key, head); err = skcipher_walk_done(&walk, 0); } @@ -452,10 +460,11 @@ static int gcm_decrypt(struct aead_request *req) struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead); unsigned int authsize = crypto_aead_authsize(aead); struct 
skcipher_walk walk; - u8 iv[AES_BLOCK_SIZE]; + u8 iv[2 * AES_BLOCK_SIZE]; u8 tag[AES_BLOCK_SIZE]; - u8 buf[GHASH_BLOCK_SIZE]; + u8 buf[2 * GHASH_BLOCK_SIZE]; u64 dg[2] = {}; + int nrounds = num_rounds(&ctx->aes_key); int err; if (req->assoclen) @@ -466,37 +475,44 @@ static int gcm_decrypt(struct aead_request *req) if (likely(may_use_simd())) { kernel_neon_begin(); - - pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc, - num_rounds(&ctx->aes_key)); + pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc, nrounds); put_unaligned_be32(2, iv + GCM_IV_SIZE); kernel_neon_end(); err = skcipher_walk_aead_decrypt(&walk, req, false); - while (walk.nbytes >= AES_BLOCK_SIZE) { - int blocks = walk.nbytes / AES_BLOCK_SIZE; + while (walk.nbytes >= 2 * AES_BLOCK_SIZE) { + int blocks = walk.nbytes / (2 * AES_BLOCK_SIZE) * 2; kernel_neon_begin(); pmull_gcm_decrypt(blocks, dg, walk.dst.virt.addr, walk.src.virt.addr, &ctx->ghash_key, - iv, ctx->aes_key.key_enc, - num_rounds(&ctx->aes_key)); + iv, ctx->aes_key.key_enc, nrounds); kernel_neon_end(); err = skcipher_walk_done(&walk, - walk.nbytes % AES_BLOCK_SIZE); + walk.nbytes % (2 * AES_BLOCK_SIZE)); } + if (walk.nbytes) { + u8 *iv2 = iv + AES_BLOCK_SIZE; + + if (walk.nbytes > AES_BLOCK_SIZE) { + memcpy(iv2, iv, AES_BLOCK_SIZE); + crypto_inc(iv2, AES_BLOCK_SIZE); + } + kernel_neon_begin(); pmull_gcm_encrypt_block(iv, iv, ctx->aes_key.key_enc, - num_rounds(&ctx->aes_key)); + nrounds); + + if (walk.nbytes > AES_BLOCK_SIZE) + pmull_gcm_encrypt_block(iv2, iv2, NULL, + nrounds); kernel_neon_end(); } - } else { - __aes_arm64_encrypt(ctx->aes_key.key_enc, tag, iv, - num_rounds(&ctx->aes_key)); + __aes_arm64_encrypt(ctx->aes_key.key_enc, tag, iv, nrounds); put_unaligned_be32(2, iv + GCM_IV_SIZE); err = skcipher_walk_aead_decrypt(&walk, req, false); @@ -511,8 +527,7 @@ static int gcm_decrypt(struct aead_request *req) do { __aes_arm64_encrypt(ctx->aes_key.key_enc, - buf, iv, - num_rounds(&ctx->aes_key)); + buf, iv, nrounds); crypto_xor_cpy(dst, src, buf, AES_BLOCK_SIZE); crypto_inc(iv, AES_BLOCK_SIZE); @@ -525,14 +540,24 @@ static int gcm_decrypt(struct aead_request *req) } if (walk.nbytes) __aes_arm64_encrypt(ctx->aes_key.key_enc, iv, iv, - num_rounds(&ctx->aes_key)); + nrounds); } /* handle the tail */ if (walk.nbytes) { - memcpy(buf, walk.src.virt.addr, walk.nbytes); - memset(buf + walk.nbytes, 0, GHASH_BLOCK_SIZE - walk.nbytes); - ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL); + const u8 *src = walk.src.virt.addr; + const u8 *head = NULL; + unsigned int nbytes = walk.nbytes; + + if (walk.nbytes > GHASH_BLOCK_SIZE) { + head = src; + src += GHASH_BLOCK_SIZE; + nbytes %= GHASH_BLOCK_SIZE; + } + + memcpy(buf, src, nbytes); + memset(buf + nbytes, 0, GHASH_BLOCK_SIZE - nbytes); + ghash_do_update(!!nbytes, dg, buf, &ctx->ghash_key, head); crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr, iv, walk.nbytes); @@ -557,7 +582,7 @@ static int gcm_decrypt(struct aead_request *req) static struct aead_alg gcm_aes_alg = { .ivsize = GCM_IV_SIZE, - .chunksize = AES_BLOCK_SIZE, + .chunksize = 2 * AES_BLOCK_SIZE, .maxauthsize = AES_BLOCK_SIZE, .setkey = gcm_setkey, .setauthsize = gcm_setauthsize, -- cgit v1.2.3 From e0bd888dc487e0c444ee5f3bf55020862d16a225 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 30 Jul 2018 23:06:41 +0200 Subject: crypto: arm64/aes-ce-gcm - implement 2-way aggregation Implement a faster version of the GHASH transform which amortizes the reduction modulo the characteristic polynomial across two input blocks at a time. 
On a Cortex-A53, the gcm(aes) performance increases 24%, from 3.0 cycles per byte to 2.4 cpb for large input sizes. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/ghash-ce-core.S | 86 +++++++++++++-------------------------- arch/arm64/crypto/ghash-ce-glue.c | 34 +++++++++++----- 2 files changed, 52 insertions(+), 68 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S index dac0df29d194..f7281e7a592f 100644 --- a/arch/arm64/crypto/ghash-ce-core.S +++ b/arch/arm64/crypto/ghash-ce-core.S @@ -290,6 +290,10 @@ ENDPROC(pmull_ghash_update_p8) KS1 .req v9 INP0 .req v10 INP1 .req v11 + HH .req v12 + XL2 .req v13 + XM2 .req v14 + XH2 .req v15 .macro load_round_keys, rounds, rk cmp \rounds, #12 @@ -323,6 +327,7 @@ ENDPROC(pmull_ghash_update_p8) .endm .macro pmull_gcm_do_crypt, enc + ld1 {HH.2d}, [x4], #16 ld1 {SHASH.2d}, [x4] ld1 {XL.2d}, [x1] ldr x8, [x5, #8] // load lower counter @@ -330,10 +335,11 @@ ENDPROC(pmull_ghash_update_p8) load_round_keys w7, x6 movi MASK.16b, #0xe1 - ext SHASH2.16b, SHASH.16b, SHASH.16b, #8 + trn1 SHASH2.2d, SHASH.2d, HH.2d + trn2 T1.2d, SHASH.2d, HH.2d CPU_LE( rev x8, x8 ) shl MASK.2d, MASK.2d, #57 - eor SHASH2.16b, SHASH2.16b, SHASH.16b + eor SHASH2.16b, SHASH2.16b, T1.16b .if \enc == 1 ldr x10, [sp] @@ -358,116 +364,82 @@ CPU_LE( rev x8, x8 ) ins KS0.d[1], x9 // set lower counter ins KS1.d[1], x11 - rev64 T1.16b, INP0.16b + rev64 T1.16b, INP1.16b cmp w7, #12 b.ge 2f // AES-192/256? 1: enc_round KS0, v21 - - ext T2.16b, XL.16b, XL.16b, #8 ext IN1.16b, T1.16b, T1.16b, #8 enc_round KS1, v21 - - eor T1.16b, T1.16b, T2.16b - eor XL.16b, XL.16b, IN1.16b + pmull2 XH2.1q, SHASH.2d, IN1.2d // a1 * b1 enc_round KS0, v22 - - pmull2 XH.1q, SHASH.2d, XL.2d // a1 * b1 - eor T1.16b, T1.16b, XL.16b + eor T1.16b, T1.16b, IN1.16b enc_round KS1, v22 - - pmull XL.1q, SHASH.1d, XL.1d // a0 * b0 - pmull XM.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0) + pmull XL2.1q, SHASH.1d, IN1.1d // a0 * b0 enc_round KS0, v23 - - ext T1.16b, XL.16b, XH.16b, #8 - eor T2.16b, XL.16b, XH.16b - eor XM.16b, XM.16b, T1.16b + pmull XM2.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0) enc_round KS1, v23 - - eor XM.16b, XM.16b, T2.16b - pmull T2.1q, XL.1d, MASK.1d + rev64 T1.16b, INP0.16b + ext T2.16b, XL.16b, XL.16b, #8 enc_round KS0, v24 - - mov XH.d[0], XM.d[1] - mov XM.d[1], XL.d[0] + ext IN1.16b, T1.16b, T1.16b, #8 + eor T1.16b, T1.16b, T2.16b enc_round KS1, v24 - - eor XL.16b, XM.16b, T2.16b + eor XL.16b, XL.16b, IN1.16b enc_round KS0, v25 - - ext T2.16b, XL.16b, XL.16b, #8 + eor T1.16b, T1.16b, XL.16b enc_round KS1, v25 - - pmull XL.1q, XL.1d, MASK.1d - eor T2.16b, T2.16b, XH.16b + pmull2 XH.1q, HH.2d, XL.2d // a1 * b1 enc_round KS0, v26 - - eor XL.16b, XL.16b, T2.16b - rev64 T1.16b, INP1.16b + pmull XL.1q, HH.1d, XL.1d // a0 * b0 enc_round KS1, v26 - - ext T2.16b, XL.16b, XL.16b, #8 - ext IN1.16b, T1.16b, T1.16b, #8 + pmull2 XM.1q, SHASH2.2d, T1.2d // (a1 + a0)(b1 + b0) enc_round KS0, v27 - - eor T1.16b, T1.16b, T2.16b - eor XL.16b, XL.16b, IN1.16b + eor XL.16b, XL.16b, XL2.16b + eor XH.16b, XH.16b, XH2.16b enc_round KS1, v27 - - pmull2 XH.1q, SHASH.2d, XL.2d // a1 * b1 - eor T1.16b, T1.16b, XL.16b + eor XM.16b, XM.16b, XM2.16b + ext T1.16b, XL.16b, XH.16b, #8 enc_round KS0, v28 - - pmull XL.1q, SHASH.1d, XL.1d // a0 * b0 - pmull XM.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0) - - enc_round KS1, v28 - - ext T1.16b, XL.16b, XH.16b, #8 eor T2.16b, XL.16b, XH.16b eor XM.16b, XM.16b, T1.16b - enc_round KS0, v29 - + 
enc_round KS1, v28 eor XM.16b, XM.16b, T2.16b + + enc_round KS0, v29 pmull T2.1q, XL.1d, MASK.1d enc_round KS1, v29 - mov XH.d[0], XM.d[1] mov XM.d[1], XL.d[0] aese KS0.16b, v30.16b - eor XL.16b, XM.16b, T2.16b aese KS1.16b, v30.16b - ext T2.16b, XL.16b, XL.16b, #8 eor KS0.16b, KS0.16b, v31.16b - pmull XL.1q, XL.1d, MASK.1d eor T2.16b, T2.16b, XH.16b eor KS1.16b, KS1.16b, v31.16b - eor XL.16b, XL.16b, T2.16b .if \enc == 0 diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c index 8ff6732c4fb5..cd91b146c87d 100644 --- a/arch/arm64/crypto/ghash-ce-glue.c +++ b/arch/arm64/crypto/ghash-ce-glue.c @@ -46,6 +46,7 @@ struct ghash_desc_ctx { struct gcm_aes_ctx { struct crypto_aes_ctx aes_key; + u64 h2[2]; struct ghash_key ghash_key; }; @@ -62,12 +63,11 @@ static void (*pmull_ghash_update)(int blocks, u64 dg[], const char *src, const char *head); asmlinkage void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], - const u8 src[], struct ghash_key const *k, - u8 ctr[], u32 const rk[], int rounds, - u8 ks[]); + const u8 src[], u64 const *k, u8 ctr[], + u32 const rk[], int rounds, u8 ks[]); asmlinkage void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], - const u8 src[], struct ghash_key const *k, + const u8 src[], u64 const *k, u8 ctr[], u32 const rk[], int rounds); asmlinkage void pmull_gcm_encrypt_block(u8 dst[], u8 const src[], @@ -232,7 +232,8 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *inkey, unsigned int keylen) { struct gcm_aes_ctx *ctx = crypto_aead_ctx(tfm); - u8 key[GHASH_BLOCK_SIZE]; + be128 h1, h2; + u8 *key = (u8 *)&h1; int ret; ret = crypto_aes_expand_key(&ctx->aes_key, inkey, keylen); @@ -244,7 +245,19 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *inkey, __aes_arm64_encrypt(ctx->aes_key.key_enc, key, (u8[AES_BLOCK_SIZE]){}, num_rounds(&ctx->aes_key)); - return __ghash_setkey(&ctx->ghash_key, key, sizeof(key)); + __ghash_setkey(&ctx->ghash_key, key, sizeof(be128)); + + /* calculate H^2 (used for 2-way aggregation) */ + h2 = h1; + gf128mul_lle(&h2, &h1); + + ctx->h2[0] = (be64_to_cpu(h2.b) << 1) | (be64_to_cpu(h2.a) >> 63); + ctx->h2[1] = (be64_to_cpu(h2.a) << 1) | (be64_to_cpu(h2.b) >> 63); + + if (be64_to_cpu(h2.a) >> 63) + ctx->h2[1] ^= 0xc200000000000000UL; + + return 0; } static int gcm_setauthsize(struct crypto_aead *tfm, unsigned int authsize) @@ -378,9 +391,8 @@ static int gcm_encrypt(struct aead_request *req) kernel_neon_begin(); pmull_gcm_encrypt(blocks, dg, walk.dst.virt.addr, - walk.src.virt.addr, &ctx->ghash_key, - iv, ctx->aes_key.key_enc, nrounds, - ks); + walk.src.virt.addr, ctx->h2, iv, + ctx->aes_key.key_enc, nrounds, ks); kernel_neon_end(); err = skcipher_walk_done(&walk, @@ -486,8 +498,8 @@ static int gcm_decrypt(struct aead_request *req) kernel_neon_begin(); pmull_gcm_decrypt(blocks, dg, walk.dst.virt.addr, - walk.src.virt.addr, &ctx->ghash_key, - iv, ctx->aes_key.key_enc, nrounds); + walk.src.virt.addr, ctx->h2, iv, + ctx->aes_key.key_enc, nrounds); kernel_neon_end(); err = skcipher_walk_done(&walk, -- cgit v1.2.3 From 30f1a9f53e77e4c9ddf55ebfda8a9d7666e46964 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 30 Jul 2018 23:06:42 +0200 Subject: crypto: arm64/aes-ce-gcm - don't reload key schedule if avoidable Squeeze out another 5% of performance by minimizing the number of invocations of kernel_neon_begin()/kernel_neon_end() on the common path, which also allows some reloads of the key schedule to be optimized away. The resulting code runs at 2.3 cycles per byte on a Cortex-A53. 
Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/ghash-ce-core.S | 9 +++-- arch/arm64/crypto/ghash-ce-glue.c | 81 +++++++++++++++++++++------------------ 2 files changed, 49 insertions(+), 41 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S index f7281e7a592f..913e49932ae6 100644 --- a/arch/arm64/crypto/ghash-ce-core.S +++ b/arch/arm64/crypto/ghash-ce-core.S @@ -1,7 +1,7 @@ /* * Accelerated GHASH implementation with ARMv8 PMULL instructions. * - * Copyright (C) 2014 - 2017 Linaro Ltd. + * Copyright (C) 2014 - 2018 Linaro Ltd. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 as published @@ -332,8 +332,6 @@ ENDPROC(pmull_ghash_update_p8) ld1 {XL.2d}, [x1] ldr x8, [x5, #8] // load lower counter - load_round_keys w7, x6 - movi MASK.16b, #0xe1 trn1 SHASH2.2d, SHASH.2d, HH.2d trn2 T1.2d, SHASH.2d, HH.2d @@ -346,6 +344,8 @@ CPU_LE( rev x8, x8 ) ld1 {KS0.16b-KS1.16b}, [x10] .endif + cbnz x6, 4f + 0: ld1 {INP0.16b-INP1.16b}, [x3], #32 rev x9, x8 @@ -471,6 +471,9 @@ CPU_LE( rev x8, x8 ) enc_round KS0, v20 enc_round KS1, v20 b 1b + +4: load_round_keys w7, x6 + b 0b .endm /* diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c index cd91b146c87d..42a0e84e276c 100644 --- a/arch/arm64/crypto/ghash-ce-glue.c +++ b/arch/arm64/crypto/ghash-ce-glue.c @@ -1,7 +1,7 @@ /* * Accelerated GHASH implementation with ARMv8 PMULL instructions. * - * Copyright (C) 2014 - 2017 Linaro Ltd. + * Copyright (C) 2014 - 2018 Linaro Ltd. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 as published @@ -373,37 +373,39 @@ static int gcm_encrypt(struct aead_request *req) memcpy(iv, req->iv, GCM_IV_SIZE); put_unaligned_be32(1, iv + GCM_IV_SIZE); - if (likely(may_use_simd())) { - kernel_neon_begin(); + err = skcipher_walk_aead_encrypt(&walk, req, false); + if (likely(may_use_simd() && walk.total >= 2 * AES_BLOCK_SIZE)) { + u32 const *rk = NULL; + + kernel_neon_begin(); pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc, nrounds); put_unaligned_be32(2, iv + GCM_IV_SIZE); pmull_gcm_encrypt_block(ks, iv, NULL, nrounds); put_unaligned_be32(3, iv + GCM_IV_SIZE); pmull_gcm_encrypt_block(ks + AES_BLOCK_SIZE, iv, NULL, nrounds); put_unaligned_be32(4, iv + GCM_IV_SIZE); - kernel_neon_end(); - - err = skcipher_walk_aead_encrypt(&walk, req, false); - while (walk.nbytes >= 2 * AES_BLOCK_SIZE) { + do { int blocks = walk.nbytes / (2 * AES_BLOCK_SIZE) * 2; - kernel_neon_begin(); + if (rk) + kernel_neon_begin(); + pmull_gcm_encrypt(blocks, dg, walk.dst.virt.addr, walk.src.virt.addr, ctx->h2, iv, - ctx->aes_key.key_enc, nrounds, ks); + rk, nrounds, ks); kernel_neon_end(); err = skcipher_walk_done(&walk, walk.nbytes % (2 * AES_BLOCK_SIZE)); - } + + rk = ctx->aes_key.key_enc; + } while (walk.nbytes >= 2 * AES_BLOCK_SIZE); } else { __aes_arm64_encrypt(ctx->aes_key.key_enc, tag, iv, nrounds); put_unaligned_be32(2, iv + GCM_IV_SIZE); - err = skcipher_walk_aead_encrypt(&walk, req, false); - while (walk.nbytes >= AES_BLOCK_SIZE) { int blocks = walk.nbytes / AES_BLOCK_SIZE; u8 *dst = walk.dst.virt.addr; @@ -485,50 +487,53 @@ static int gcm_decrypt(struct aead_request *req) memcpy(iv, req->iv, GCM_IV_SIZE); put_unaligned_be32(1, iv + GCM_IV_SIZE); - if (likely(may_use_simd())) { + err = skcipher_walk_aead_decrypt(&walk, req, false); + + if 
(likely(may_use_simd() && walk.total >= 2 * AES_BLOCK_SIZE)) { + u32 const *rk = NULL; + kernel_neon_begin(); pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc, nrounds); put_unaligned_be32(2, iv + GCM_IV_SIZE); - kernel_neon_end(); - err = skcipher_walk_aead_decrypt(&walk, req, false); - - while (walk.nbytes >= 2 * AES_BLOCK_SIZE) { + do { int blocks = walk.nbytes / (2 * AES_BLOCK_SIZE) * 2; + int rem = walk.total - blocks * AES_BLOCK_SIZE; + + if (rk) + kernel_neon_begin(); - kernel_neon_begin(); pmull_gcm_decrypt(blocks, dg, walk.dst.virt.addr, walk.src.virt.addr, ctx->h2, iv, - ctx->aes_key.key_enc, nrounds); - kernel_neon_end(); + rk, nrounds); - err = skcipher_walk_done(&walk, - walk.nbytes % (2 * AES_BLOCK_SIZE)); - } + /* check if this is the final iteration of the loop */ + if (rem < (2 * AES_BLOCK_SIZE)) { + u8 *iv2 = iv + AES_BLOCK_SIZE; - if (walk.nbytes) { - u8 *iv2 = iv + AES_BLOCK_SIZE; + if (rem > AES_BLOCK_SIZE) { + memcpy(iv2, iv, AES_BLOCK_SIZE); + crypto_inc(iv2, AES_BLOCK_SIZE); + } - if (walk.nbytes > AES_BLOCK_SIZE) { - memcpy(iv2, iv, AES_BLOCK_SIZE); - crypto_inc(iv2, AES_BLOCK_SIZE); - } + pmull_gcm_encrypt_block(iv, iv, NULL, nrounds); - kernel_neon_begin(); - pmull_gcm_encrypt_block(iv, iv, ctx->aes_key.key_enc, - nrounds); + if (rem > AES_BLOCK_SIZE) + pmull_gcm_encrypt_block(iv2, iv2, NULL, + nrounds); + } - if (walk.nbytes > AES_BLOCK_SIZE) - pmull_gcm_encrypt_block(iv2, iv2, NULL, - nrounds); kernel_neon_end(); - } + + err = skcipher_walk_done(&walk, + walk.nbytes % (2 * AES_BLOCK_SIZE)); + + rk = ctx->aes_key.key_enc; + } while (walk.nbytes >= 2 * AES_BLOCK_SIZE); } else { __aes_arm64_encrypt(ctx->aes_key.key_enc, tag, iv, nrounds); put_unaligned_be32(2, iv + GCM_IV_SIZE); - err = skcipher_walk_aead_decrypt(&walk, req, false); - while (walk.nbytes >= AES_BLOCK_SIZE) { int blocks = walk.nbytes / AES_BLOCK_SIZE; u8 *dst = walk.dst.virt.addr; -- cgit v1.2.3 From 8e492eff7de955e6ed1dc2989b17c41cd862aa28 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 4 Aug 2018 20:46:24 +0200 Subject: crypto: arm64/ghash-ce - replace NEON yield check with block limit Checking the TIF_NEED_RESCHED flag is disproportionately costly on cores with fast crypto instructions and comparatively slow memory accesses. On algorithms such as GHASH, which executes at ~1 cycle per byte on cores that implement support for 64 bit polynomial multiplication, there is really no need to check the TIF_NEED_RESCHED particularly often, and so we can remove the NEON yield check from the assembler routines. However, unlike the AEAD or skcipher APIs, the shash/ahash APIs take arbitrary input lengths, and so there needs to be some sanity check to ensure that we don't hog the CPU for excessive amounts of time. So let's simply cap the maximum input size that is processed in one go to 64 KB. 
Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/ghash-ce-core.S | 39 +++++++++++---------------------------- arch/arm64/crypto/ghash-ce-glue.c | 16 ++++++++++++---- 2 files changed, 23 insertions(+), 32 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S index 913e49932ae6..344811c6a0ca 100644 --- a/arch/arm64/crypto/ghash-ce-core.S +++ b/arch/arm64/crypto/ghash-ce-core.S @@ -213,31 +213,23 @@ .endm .macro __pmull_ghash, pn - frame_push 5 - - mov x19, x0 - mov x20, x1 - mov x21, x2 - mov x22, x3 - mov x23, x4 - -0: ld1 {SHASH.2d}, [x22] - ld1 {XL.2d}, [x20] + ld1 {SHASH.2d}, [x3] + ld1 {XL.2d}, [x1] ext SHASH2.16b, SHASH.16b, SHASH.16b, #8 eor SHASH2.16b, SHASH2.16b, SHASH.16b __pmull_pre_\pn /* do the head block first, if supplied */ - cbz x23, 1f - ld1 {T1.2d}, [x23] - mov x23, xzr - b 2f + cbz x4, 0f + ld1 {T1.2d}, [x4] + mov x4, xzr + b 1f -1: ld1 {T1.2d}, [x21], #16 - sub w19, w19, #1 +0: ld1 {T1.2d}, [x2], #16 + sub w0, w0, #1 -2: /* multiply XL by SHASH in GF(2^128) */ +1: /* multiply XL by SHASH in GF(2^128) */ CPU_LE( rev64 T1.16b, T1.16b ) ext T2.16b, XL.16b, XL.16b, #8 @@ -259,18 +251,9 @@ CPU_LE( rev64 T1.16b, T1.16b ) eor T2.16b, T2.16b, XH.16b eor XL.16b, XL.16b, T2.16b - cbz w19, 3f - - if_will_cond_yield_neon - st1 {XL.2d}, [x20] - do_cond_yield_neon - b 0b - endif_yield_neon - - b 1b + cbnz w0, 0b -3: st1 {XL.2d}, [x20] - frame_pop + st1 {XL.2d}, [x1] ret .endm diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c index 42a0e84e276c..3c2c446dc96c 100644 --- a/arch/arm64/crypto/ghash-ce-glue.c +++ b/arch/arm64/crypto/ghash-ce-glue.c @@ -113,6 +113,9 @@ static void ghash_do_update(int blocks, u64 dg[], const char *src, } } +/* avoid hogging the CPU for too long */ +#define MAX_BLOCKS (SZ_64K / GHASH_BLOCK_SIZE) + static int ghash_update(struct shash_desc *desc, const u8 *src, unsigned int len) { @@ -136,11 +139,16 @@ static int ghash_update(struct shash_desc *desc, const u8 *src, blocks = len / GHASH_BLOCK_SIZE; len %= GHASH_BLOCK_SIZE; - ghash_do_update(blocks, ctx->digest, src, key, - partial ? ctx->buf : NULL); + do { + int chunk = min(blocks, MAX_BLOCKS); + + ghash_do_update(chunk, ctx->digest, src, key, + partial ? ctx->buf : NULL); - src += blocks * GHASH_BLOCK_SIZE; - partial = 0; + blocks -= chunk; + src += chunk * GHASH_BLOCK_SIZE; + partial = 0; + } while (unlikely(blocks > 0)); } if (len) memcpy(ctx->buf + partial, src, len); -- cgit v1.2.3 From 22240df7ac6d76a271197571a7be45addef2ba15 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 4 Aug 2018 20:46:25 +0200 Subject: crypto: arm64/ghash-ce - implement 4-way aggregation Enhance the GHASH implementation that uses 64-bit polynomial multiplication by adding support for 4-way aggregation. This more than doubles the performance, from 2.4 cycles per byte to 1.1 cpb on Cortex-A53. 
Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/ghash-ce-core.S | 122 ++++++++++++++++++++++++++++++++------ arch/arm64/crypto/ghash-ce-glue.c | 71 +++++++++++----------- 2 files changed, 142 insertions(+), 51 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S index 344811c6a0ca..1b319b716d5e 100644 --- a/arch/arm64/crypto/ghash-ce-core.S +++ b/arch/arm64/crypto/ghash-ce-core.S @@ -46,6 +46,19 @@ ss3 .req v26 ss4 .req v27 + XL2 .req v8 + XM2 .req v9 + XH2 .req v10 + XL3 .req v11 + XM3 .req v12 + XH3 .req v13 + TT3 .req v14 + TT4 .req v15 + HH .req v16 + HH3 .req v17 + HH4 .req v18 + HH34 .req v19 + .text .arch armv8-a+crypto @@ -134,11 +147,25 @@ .endm .macro __pmull_pre_p64 + add x8, x3, #16 + ld1 {HH.2d-HH4.2d}, [x8] + + trn1 SHASH2.2d, SHASH.2d, HH.2d + trn2 T1.2d, SHASH.2d, HH.2d + eor SHASH2.16b, SHASH2.16b, T1.16b + + trn1 HH34.2d, HH3.2d, HH4.2d + trn2 T1.2d, HH3.2d, HH4.2d + eor HH34.16b, HH34.16b, T1.16b + movi MASK.16b, #0xe1 shl MASK.2d, MASK.2d, #57 .endm .macro __pmull_pre_p8 + ext SHASH2.16b, SHASH.16b, SHASH.16b, #8 + eor SHASH2.16b, SHASH2.16b, SHASH.16b + // k00_16 := 0x0000000000000000_000000000000ffff // k32_48 := 0x00000000ffffffff_0000ffffffffffff movi k32_48.2d, #0xffffffff @@ -215,8 +242,6 @@ .macro __pmull_ghash, pn ld1 {SHASH.2d}, [x3] ld1 {XL.2d}, [x1] - ext SHASH2.16b, SHASH.16b, SHASH.16b, #8 - eor SHASH2.16b, SHASH2.16b, SHASH.16b __pmull_pre_\pn @@ -224,12 +249,79 @@ cbz x4, 0f ld1 {T1.2d}, [x4] mov x4, xzr - b 1f + b 3f + +0: .ifc \pn, p64 + tbnz w0, #0, 2f // skip until #blocks is a + tbnz w0, #1, 2f // round multiple of 4 + +1: ld1 {XM3.16b-TT4.16b}, [x2], #64 + + sub w0, w0, #4 + + rev64 T1.16b, XM3.16b + rev64 T2.16b, XH3.16b + rev64 TT4.16b, TT4.16b + rev64 TT3.16b, TT3.16b + + ext IN1.16b, TT4.16b, TT4.16b, #8 + ext XL3.16b, TT3.16b, TT3.16b, #8 + + eor TT4.16b, TT4.16b, IN1.16b + pmull2 XH2.1q, SHASH.2d, IN1.2d // a1 * b1 + pmull XL2.1q, SHASH.1d, IN1.1d // a0 * b0 + pmull XM2.1q, SHASH2.1d, TT4.1d // (a1 + a0)(b1 + b0) + + eor TT3.16b, TT3.16b, XL3.16b + pmull2 XH3.1q, HH.2d, XL3.2d // a1 * b1 + pmull XL3.1q, HH.1d, XL3.1d // a0 * b0 + pmull2 XM3.1q, SHASH2.2d, TT3.2d // (a1 + a0)(b1 + b0) + + ext IN1.16b, T2.16b, T2.16b, #8 + eor XL2.16b, XL2.16b, XL3.16b + eor XH2.16b, XH2.16b, XH3.16b + eor XM2.16b, XM2.16b, XM3.16b + + eor T2.16b, T2.16b, IN1.16b + pmull2 XH3.1q, HH3.2d, IN1.2d // a1 * b1 + pmull XL3.1q, HH3.1d, IN1.1d // a0 * b0 + pmull XM3.1q, HH34.1d, T2.1d // (a1 + a0)(b1 + b0) -0: ld1 {T1.2d}, [x2], #16 + eor XL2.16b, XL2.16b, XL3.16b + eor XH2.16b, XH2.16b, XH3.16b + eor XM2.16b, XM2.16b, XM3.16b + + ext IN1.16b, T1.16b, T1.16b, #8 + ext TT3.16b, XL.16b, XL.16b, #8 + eor XL.16b, XL.16b, IN1.16b + eor T1.16b, T1.16b, TT3.16b + + pmull2 XH.1q, HH4.2d, XL.2d // a1 * b1 + eor T1.16b, T1.16b, XL.16b + pmull XL.1q, HH4.1d, XL.1d // a0 * b0 + pmull2 XM.1q, HH34.2d, T1.2d // (a1 + a0)(b1 + b0) + + eor XL.16b, XL.16b, XL2.16b + eor XH.16b, XH.16b, XH2.16b + eor XM.16b, XM.16b, XM2.16b + + eor T2.16b, XL.16b, XH.16b + ext T1.16b, XL.16b, XH.16b, #8 + eor XM.16b, XM.16b, T2.16b + + __pmull_reduce_p64 + + eor T2.16b, T2.16b, XH.16b + eor XL.16b, XL.16b, T2.16b + + cbz w0, 5f + b 1b + .endif + +2: ld1 {T1.2d}, [x2], #16 sub w0, w0, #1 -1: /* multiply XL by SHASH in GF(2^128) */ +3: /* multiply XL by SHASH in GF(2^128) */ CPU_LE( rev64 T1.16b, T1.16b ) ext T2.16b, XL.16b, XL.16b, #8 @@ -242,7 +334,7 @@ CPU_LE( rev64 T1.16b, T1.16b ) __pmull_\pn XL, XL, 
SHASH // a0 * b0 __pmull_\pn XM, T1, SHASH2 // (a1 + a0)(b1 + b0) - eor T2.16b, XL.16b, XH.16b +4: eor T2.16b, XL.16b, XH.16b ext T1.16b, XL.16b, XH.16b, #8 eor XM.16b, XM.16b, T2.16b @@ -253,7 +345,7 @@ CPU_LE( rev64 T1.16b, T1.16b ) cbnz w0, 0b - st1 {XL.2d}, [x1] +5: st1 {XL.2d}, [x1] ret .endm @@ -269,14 +361,10 @@ ENTRY(pmull_ghash_update_p8) __pmull_ghash p8 ENDPROC(pmull_ghash_update_p8) - KS0 .req v8 - KS1 .req v9 - INP0 .req v10 - INP1 .req v11 - HH .req v12 - XL2 .req v13 - XM2 .req v14 - XH2 .req v15 + KS0 .req v12 + KS1 .req v13 + INP0 .req v14 + INP1 .req v15 .macro load_round_keys, rounds, rk cmp \rounds, #12 @@ -310,8 +398,8 @@ ENDPROC(pmull_ghash_update_p8) .endm .macro pmull_gcm_do_crypt, enc - ld1 {HH.2d}, [x4], #16 - ld1 {SHASH.2d}, [x4] + ld1 {SHASH.2d}, [x4], #16 + ld1 {HH.2d}, [x4] ld1 {XL.2d}, [x1] ldr x8, [x5, #8] // load lower counter diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c index 3c2c446dc96c..6e9f33d14930 100644 --- a/arch/arm64/crypto/ghash-ce-glue.c +++ b/arch/arm64/crypto/ghash-ce-glue.c @@ -33,9 +33,12 @@ MODULE_ALIAS_CRYPTO("ghash"); #define GCM_IV_SIZE 12 struct ghash_key { - u64 a; - u64 b; - be128 k; + u64 h[2]; + u64 h2[2]; + u64 h3[2]; + u64 h4[2]; + + be128 k; }; struct ghash_desc_ctx { @@ -46,7 +49,6 @@ struct ghash_desc_ctx { struct gcm_aes_ctx { struct crypto_aes_ctx aes_key; - u64 h2[2]; struct ghash_key ghash_key; }; @@ -63,11 +65,12 @@ static void (*pmull_ghash_update)(int blocks, u64 dg[], const char *src, const char *head); asmlinkage void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], - const u8 src[], u64 const *k, u8 ctr[], - u32 const rk[], int rounds, u8 ks[]); + const u8 src[], struct ghash_key const *k, + u8 ctr[], u32 const rk[], int rounds, + u8 ks[]); asmlinkage void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], - const u8 src[], u64 const *k, + const u8 src[], struct ghash_key const *k, u8 ctr[], u32 const rk[], int rounds); asmlinkage void pmull_gcm_encrypt_block(u8 dst[], u8 const src[], @@ -174,23 +177,36 @@ static int ghash_final(struct shash_desc *desc, u8 *dst) return 0; } +static void ghash_reflect(u64 h[], const be128 *k) +{ + u64 carry = be64_to_cpu(k->a) & BIT(63) ? 
1 : 0; + + h[0] = (be64_to_cpu(k->b) << 1) | carry; + h[1] = (be64_to_cpu(k->a) << 1) | (be64_to_cpu(k->b) >> 63); + + if (carry) + h[1] ^= 0xc200000000000000UL; +} + static int __ghash_setkey(struct ghash_key *key, const u8 *inkey, unsigned int keylen) { - u64 a, b; + be128 h; /* needed for the fallback */ memcpy(&key->k, inkey, GHASH_BLOCK_SIZE); - /* perform multiplication by 'x' in GF(2^128) */ - b = get_unaligned_be64(inkey); - a = get_unaligned_be64(inkey + 8); + ghash_reflect(key->h, &key->k); + + h = key->k; + gf128mul_lle(&h, &key->k); + ghash_reflect(key->h2, &h); - key->a = (a << 1) | (b >> 63); - key->b = (b << 1) | (a >> 63); + gf128mul_lle(&h, &key->k); + ghash_reflect(key->h3, &h); - if (b >> 63) - key->b ^= 0xc200000000000000UL; + gf128mul_lle(&h, &key->k); + ghash_reflect(key->h4, &h); return 0; } @@ -240,8 +256,7 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *inkey, unsigned int keylen) { struct gcm_aes_ctx *ctx = crypto_aead_ctx(tfm); - be128 h1, h2; - u8 *key = (u8 *)&h1; + u8 key[GHASH_BLOCK_SIZE]; int ret; ret = crypto_aes_expand_key(&ctx->aes_key, inkey, keylen); @@ -253,19 +268,7 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *inkey, __aes_arm64_encrypt(ctx->aes_key.key_enc, key, (u8[AES_BLOCK_SIZE]){}, num_rounds(&ctx->aes_key)); - __ghash_setkey(&ctx->ghash_key, key, sizeof(be128)); - - /* calculate H^2 (used for 2-way aggregation) */ - h2 = h1; - gf128mul_lle(&h2, &h1); - - ctx->h2[0] = (be64_to_cpu(h2.b) << 1) | (be64_to_cpu(h2.a) >> 63); - ctx->h2[1] = (be64_to_cpu(h2.a) << 1) | (be64_to_cpu(h2.b) >> 63); - - if (be64_to_cpu(h2.a) >> 63) - ctx->h2[1] ^= 0xc200000000000000UL; - - return 0; + return __ghash_setkey(&ctx->ghash_key, key, sizeof(be128)); } static int gcm_setauthsize(struct crypto_aead *tfm, unsigned int authsize) @@ -401,8 +404,8 @@ static int gcm_encrypt(struct aead_request *req) kernel_neon_begin(); pmull_gcm_encrypt(blocks, dg, walk.dst.virt.addr, - walk.src.virt.addr, ctx->h2, iv, - rk, nrounds, ks); + walk.src.virt.addr, &ctx->ghash_key, + iv, rk, nrounds, ks); kernel_neon_end(); err = skcipher_walk_done(&walk, @@ -512,8 +515,8 @@ static int gcm_decrypt(struct aead_request *req) kernel_neon_begin(); pmull_gcm_decrypt(blocks, dg, walk.dst.virt.addr, - walk.src.virt.addr, ctx->h2, iv, - rk, nrounds); + walk.src.virt.addr, &ctx->ghash_key, + iv, rk, nrounds); /* check if this is the final iteration of the loop */ if (rem < (2 * AES_BLOCK_SIZE)) { -- cgit v1.2.3
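
Editor's note on the 2-way and 4-way aggregation patches above: the underlying identity is ordinary distributivity in GF(2^128). Folding ciphertext blocks serially,

    X_i = (X_{i-1} ^ C_i) * H        for i = 1..4,

gives the same result as combining a whole group of blocks with precomputed powers of H,

    X_4 = (X_0 ^ C_1)*H^4 ^ C_2*H^3 ^ C_3*H^2 ^ C_4*H,

which is what lets the assembler accumulate unreduced partial products for a group and perform the modular reduction only once per group. The stand-alone user-space C sketch below demonstrates the identity. It is purely illustrative and not taken from any of the patches; the names (struct gblock, gf128_mul, gxor) and the deliberately slow bit-serial field multiply are made up for this example and are not kernel APIs.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* one 128-bit GHASH element, stored as two big-endian-ordered halves */
struct gblock { uint64_t hi, lo; };

/*
 * Naive bit-serial multiply in GF(2^128) with x^128 + x^7 + x^2 + x + 1,
 * using the MSB-first bit order defined by the GCM specification.
 * Written for clarity, not speed.
 */
static struct gblock gf128_mul(struct gblock a, struct gblock b)
{
        struct gblock z = { 0, 0 };
        struct gblock v = a;

        for (int i = 0; i < 128; i++) {
                uint64_t bit, carry;

                if (i < 64)
                        bit = (b.hi >> (63 - i)) & 1;
                else
                        bit = (b.lo >> (127 - i)) & 1;

                if (bit) {
                        z.hi ^= v.hi;
                        z.lo ^= v.lo;
                }

                /* multiply v by x: shift right, reduce with R = 0xe1 || 0^120 */
                carry = v.lo & 1;
                v.lo = (v.lo >> 1) | (v.hi << 63);
                v.hi >>= 1;
                if (carry)
                        v.hi ^= 0xe100000000000000ULL;
        }
        return z;
}

static struct gblock gxor(struct gblock a, struct gblock b)
{
        return (struct gblock){ a.hi ^ b.hi, a.lo ^ b.lo };
}

int main(void)
{
        struct gblock h = { 0x0123456789abcdefULL, 0xfedcba9876543210ULL };
        struct gblock c[4] = { { 1, 2 }, { 3, 4 }, { 5, 6 }, { 7, 8 } };
        struct gblock x = { 0, 0 };
        struct gblock serial = x, agg;
        struct gblock h2, h3, h4;

        /* serial folding: one multiply (with reduction) per block */
        for (int i = 0; i < 4; i++)
                serial = gf128_mul(gxor(serial, c[i]), h);

        /* aggregated folding: precompute H^2..H^4, then combine the group */
        h2 = gf128_mul(h, h);
        h3 = gf128_mul(h2, h);
        h4 = gf128_mul(h3, h);

        agg = gf128_mul(gxor(x, c[0]), h4);
        agg = gxor(agg, gf128_mul(c[1], h3));
        agg = gxor(agg, gf128_mul(c[2], h2));
        agg = gxor(agg, gf128_mul(c[3], h));

        printf("serial:     %016llx%016llx\n",
               (unsigned long long)serial.hi, (unsigned long long)serial.lo);
        printf("aggregated: %016llx%016llx\n",
               (unsigned long long)agg.hi, (unsigned long long)agg.lo);

        return memcmp(&serial, &agg, sizeof(serial)) ? 1 : 0;
}

In the kernel code the actual savings come from keeping the per-block partial products (XL/XM/XH) unreduced, XOR-ing them together across the group, and applying the modular reduction only once per two blocks (GCM path) or four blocks (GHASH path); this toy version still reduces inside every multiply, so it only demonstrates that the two orderings produce the same result.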