From e50944e219f908968a6e01fbd0e8811a33bd5f04 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 30 Jun 2018 15:16:11 -0700 Subject: crypto: shash - remove useless setting of type flags Many shash algorithms set .cra_flags = CRYPTO_ALG_TYPE_SHASH. But this is redundant with the C structure type ('struct shash_alg'), and crypto_register_shash() already sets the type flag automatically, clearing any type flag that was already there. Apparently the useless assignment has just been copy+pasted around. So, remove the useless assignment from all the shash algorithms. This patch shouldn't change any actual behavior. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/arm64/crypto/aes-glue.c | 3 --- arch/arm64/crypto/ghash-ce-glue.c | 1 - arch/arm64/crypto/sha1-ce-glue.c | 1 - arch/arm64/crypto/sha2-ce-glue.c | 2 -- arch/arm64/crypto/sha256-glue.c | 4 ---- arch/arm64/crypto/sha3-ce-glue.c | 4 ---- arch/arm64/crypto/sha512-ce-glue.c | 2 -- arch/arm64/crypto/sha512-glue.c | 2 -- arch/arm64/crypto/sm3-ce-glue.c | 1 - 9 files changed, 20 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index 253188fb8cb0..a615a9a991a3 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -567,7 +567,6 @@ static struct shash_alg mac_algs[] = { { .base.cra_name = "cmac(aes)", .base.cra_driver_name = "cmac-aes-" MODE, .base.cra_priority = PRIO, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = AES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct mac_tfm_ctx) + 2 * AES_BLOCK_SIZE, @@ -583,7 +582,6 @@ static struct shash_alg mac_algs[] = { { .base.cra_name = "xcbc(aes)", .base.cra_driver_name = "xcbc-aes-" MODE, .base.cra_priority = PRIO, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = AES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct mac_tfm_ctx) + 2 * AES_BLOCK_SIZE, @@ -599,7 +597,6 @@ static struct shash_alg mac_algs[] = { { .base.cra_name = "cbcmac(aes)", .base.cra_driver_name = "cbcmac-aes-" MODE, .base.cra_priority = PRIO, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct mac_tfm_ctx), .base.cra_module = THIS_MODULE, diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c index 7cf0b1aa6ea8..757fae186753 100644 --- a/arch/arm64/crypto/ghash-ce-glue.c +++ b/arch/arm64/crypto/ghash-ce-glue.c @@ -204,7 +204,6 @@ static struct shash_alg ghash_alg = { .base.cra_name = "ghash", .base.cra_driver_name = "ghash-ce", .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = GHASH_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct ghash_key), .base.cra_module = THIS_MODULE, diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c index efbeb3e0dcfb..17fac2889f56 100644 --- a/arch/arm64/crypto/sha1-ce-glue.c +++ b/arch/arm64/crypto/sha1-ce-glue.c @@ -99,7 +99,6 @@ static struct shash_alg alg = { .cra_name = "sha1", .cra_driver_name = "sha1-ce", .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA1_BLOCK_SIZE, .cra_module = THIS_MODULE, } diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c index fd1ff2b13dfa..261f5195cab7 100644 --- a/arch/arm64/crypto/sha2-ce-glue.c +++ b/arch/arm64/crypto/sha2-ce-glue.c @@ -114,7 +114,6 @@ static struct shash_alg algs[] = { { .cra_name = "sha224", .cra_driver_name = "sha224-ce", .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA256_BLOCK_SIZE, 
.cra_module = THIS_MODULE, } @@ -129,7 +128,6 @@ static struct shash_alg algs[] = { { .cra_name = "sha256", .cra_driver_name = "sha256-ce", .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA256_BLOCK_SIZE, .cra_module = THIS_MODULE, } diff --git a/arch/arm64/crypto/sha256-glue.c b/arch/arm64/crypto/sha256-glue.c index e8880ccdc71f..f1b4f4420ca1 100644 --- a/arch/arm64/crypto/sha256-glue.c +++ b/arch/arm64/crypto/sha256-glue.c @@ -68,7 +68,6 @@ static struct shash_alg algs[] = { { .base.cra_name = "sha256", .base.cra_driver_name = "sha256-arm64", .base.cra_priority = 100, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA256_BLOCK_SIZE, .base.cra_module = THIS_MODULE, }, { @@ -81,7 +80,6 @@ static struct shash_alg algs[] = { { .base.cra_name = "sha224", .base.cra_driver_name = "sha224-arm64", .base.cra_priority = 100, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA224_BLOCK_SIZE, .base.cra_module = THIS_MODULE, } }; @@ -153,7 +151,6 @@ static struct shash_alg neon_algs[] = { { .base.cra_name = "sha256", .base.cra_driver_name = "sha256-arm64-neon", .base.cra_priority = 150, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA256_BLOCK_SIZE, .base.cra_module = THIS_MODULE, }, { @@ -166,7 +163,6 @@ static struct shash_alg neon_algs[] = { { .base.cra_name = "sha224", .base.cra_driver_name = "sha224-arm64-neon", .base.cra_priority = 150, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA224_BLOCK_SIZE, .base.cra_module = THIS_MODULE, } }; diff --git a/arch/arm64/crypto/sha3-ce-glue.c b/arch/arm64/crypto/sha3-ce-glue.c index da8222e528bd..a336feac0f59 100644 --- a/arch/arm64/crypto/sha3-ce-glue.c +++ b/arch/arm64/crypto/sha3-ce-glue.c @@ -105,7 +105,6 @@ static struct shash_alg algs[] = { { .descsize = sizeof(struct sha3_state), .base.cra_name = "sha3-224", .base.cra_driver_name = "sha3-224-ce", - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA3_224_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .base.cra_priority = 200, @@ -117,7 +116,6 @@ static struct shash_alg algs[] = { { .descsize = sizeof(struct sha3_state), .base.cra_name = "sha3-256", .base.cra_driver_name = "sha3-256-ce", - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA3_256_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .base.cra_priority = 200, @@ -129,7 +127,6 @@ static struct shash_alg algs[] = { { .descsize = sizeof(struct sha3_state), .base.cra_name = "sha3-384", .base.cra_driver_name = "sha3-384-ce", - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA3_384_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .base.cra_priority = 200, @@ -141,7 +138,6 @@ static struct shash_alg algs[] = { { .descsize = sizeof(struct sha3_state), .base.cra_name = "sha3-512", .base.cra_driver_name = "sha3-512-ce", - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA3_512_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .base.cra_priority = 200, diff --git a/arch/arm64/crypto/sha512-ce-glue.c b/arch/arm64/crypto/sha512-ce-glue.c index a77c8632a589..f2c5f28c622a 100644 --- a/arch/arm64/crypto/sha512-ce-glue.c +++ b/arch/arm64/crypto/sha512-ce-glue.c @@ -87,7 +87,6 @@ static struct shash_alg algs[] = { { .base.cra_name = "sha384", .base.cra_driver_name = "sha384-ce", .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA512_BLOCK_SIZE, .base.cra_module = THIS_MODULE, }, { @@ -100,7 +99,6 @@ static struct shash_alg algs[] = { { .base.cra_name = "sha512", 
.base.cra_driver_name = "sha512-ce", .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA512_BLOCK_SIZE, .base.cra_module = THIS_MODULE, } }; diff --git a/arch/arm64/crypto/sha512-glue.c b/arch/arm64/crypto/sha512-glue.c index 27db4851e380..325b23b43a9b 100644 --- a/arch/arm64/crypto/sha512-glue.c +++ b/arch/arm64/crypto/sha512-glue.c @@ -63,7 +63,6 @@ static struct shash_alg algs[] = { { .base.cra_name = "sha512", .base.cra_driver_name = "sha512-arm64", .base.cra_priority = 150, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA512_BLOCK_SIZE, .base.cra_module = THIS_MODULE, }, { @@ -76,7 +75,6 @@ static struct shash_alg algs[] = { { .base.cra_name = "sha384", .base.cra_driver_name = "sha384-arm64", .base.cra_priority = 150, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA384_BLOCK_SIZE, .base.cra_module = THIS_MODULE, } }; diff --git a/arch/arm64/crypto/sm3-ce-glue.c b/arch/arm64/crypto/sm3-ce-glue.c index 3b4948f7e26f..88938a20d9b2 100644 --- a/arch/arm64/crypto/sm3-ce-glue.c +++ b/arch/arm64/crypto/sm3-ce-glue.c @@ -72,7 +72,6 @@ static struct shash_alg sm3_alg = { .descsize = sizeof(struct sm3_state), .base.cra_name = "sm3", .base.cra_driver_name = "sm3-ce", - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SM3_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .base.cra_priority = 200, -- cgit v1.2.3 From 6b0daa78207b622d768cce2a8251c830256bade9 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 17 Jul 2018 10:09:26 -0700 Subject: crypto: arm64/sha256 - increase cra_priority of scalar implementations Commit b73b7ac0a774 ("crypto: sha256_generic - add cra_priority") gave sha256-generic and sha224-generic a cra_priority of 100, to match the convention for generic implementations. But sha256-arm64 and sha224-arm64 also have priority 100, so their order relative to the generic implementations became ambiguous. Therefore, increase their priority to 125 so that they have higher priority than the generic implementations but lower priority than the NEON implementations which have priority 150. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/arm64/crypto/sha256-glue.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/crypto/sha256-glue.c b/arch/arm64/crypto/sha256-glue.c index f1b4f4420ca1..4aedeaefd61f 100644 --- a/arch/arm64/crypto/sha256-glue.c +++ b/arch/arm64/crypto/sha256-glue.c @@ -67,7 +67,7 @@ static struct shash_alg algs[] = { { .descsize = sizeof(struct sha256_state), .base.cra_name = "sha256", .base.cra_driver_name = "sha256-arm64", - .base.cra_priority = 100, + .base.cra_priority = 125, .base.cra_blocksize = SHA256_BLOCK_SIZE, .base.cra_module = THIS_MODULE, }, { @@ -79,7 +79,7 @@ static struct shash_alg algs[] = { { .descsize = sizeof(struct sha256_state), .base.cra_name = "sha224", .base.cra_driver_name = "sha224-arm64", - .base.cra_priority = 100, + .base.cra_priority = 125, .base.cra_blocksize = SHA224_BLOCK_SIZE, .base.cra_module = THIS_MODULE, } }; -- cgit v1.2.3 From e4a1f7858ab834b2cdddfc51be2f5debae782029 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Mon, 23 Jul 2018 16:49:55 +0100 Subject: arm64: dts: hisi: add SEC crypto accelerator nodes for hip07 SoC Enable all 4 SEC units available on d05 boards. 
Signed-off-by: Jonathan Cameron Signed-off-by: Herbert Xu --- arch/arm64/boot/dts/hisilicon/hip07.dtsi | 284 +++++++++++++++++++++++++++++++ 1 file changed, 284 insertions(+) (limited to 'arch/arm64') diff --git a/arch/arm64/boot/dts/hisilicon/hip07.dtsi b/arch/arm64/boot/dts/hisilicon/hip07.dtsi index 9c10030a07f8..c33adefc3061 100644 --- a/arch/arm64/boot/dts/hisilicon/hip07.dtsi +++ b/arch/arm64/boot/dts/hisilicon/hip07.dtsi @@ -1049,7 +1049,74 @@ num-pins = <2>; }; }; + p0_mbigen_alg_a:interrupt-controller@d0080000 { + compatible = "hisilicon,mbigen-v2"; + reg = <0x0 0xd0080000 0x0 0x10000>; + p0_mbigen_sec_a: intc_sec { + msi-parent = <&p0_its_dsa_a 0x40400>; + interrupt-controller; + #interrupt-cells = <2>; + num-pins = <33>; + }; + p0_mbigen_smmu_alg_a: intc_smmu_alg { + msi-parent = <&p0_its_dsa_a 0x40b1b>; + interrupt-controller; + #interrupt-cells = <2>; + num-pins = <3>; + }; + }; + p0_mbigen_alg_b:interrupt-controller@8,d0080000 { + compatible = "hisilicon,mbigen-v2"; + reg = <0x8 0xd0080000 0x0 0x10000>; + + p0_mbigen_sec_b: intc_sec { + msi-parent = <&p0_its_dsa_b 0x42400>; + interrupt-controller; + #interrupt-cells = <2>; + num-pins = <33>; + }; + p0_mbigen_smmu_alg_b: intc_smmu_alg { + msi-parent = <&p0_its_dsa_b 0x42b1b>; + interrupt-controller; + #interrupt-cells = <2>; + num-pins = <3>; + }; + }; + p1_mbigen_alg_a:interrupt-controller@400,d0080000 { + compatible = "hisilicon,mbigen-v2"; + reg = <0x400 0xd0080000 0x0 0x10000>; + + p1_mbigen_sec_a: intc_sec { + msi-parent = <&p1_its_dsa_a 0x44400>; + interrupt-controller; + #interrupt-cells = <2>; + num-pins = <33>; + }; + p1_mbigen_smmu_alg_a: intc_smmu_alg { + msi-parent = <&p1_its_dsa_a 0x44b1b>; + interrupt-controller; + #interrupt-cells = <2>; + num-pins = <3>; + }; + }; + p1_mbigen_alg_b:interrupt-controller@408,d0080000 { + compatible = "hisilicon,mbigen-v2"; + reg = <0x408 0xd0080000 0x0 0x10000>; + + p1_mbigen_sec_b: intc_sec { + msi-parent = <&p1_its_dsa_b 0x46400>; + interrupt-controller; + #interrupt-cells = <2>; + num-pins = <33>; + }; + p1_mbigen_smmu_alg_b: intc_smmu_alg { + msi-parent = <&p1_its_dsa_b 0x46b1b>; + interrupt-controller; + #interrupt-cells = <2>; + num-pins = <3>; + }; + }; p0_mbigen_dsa_a: interrupt-controller@c0080000 { compatible = "hisilicon,mbigen-v2"; reg = <0x0 0xc0080000 0x0 0x10000>; @@ -1107,6 +1174,58 @@ hisilicon,broken-prefetch-cmd; status = "disabled"; }; + p0_smmu_alg_a: smmu_alg@d0040000 { + compatible = "arm,smmu-v3"; + reg = <0x0 0xd0040000 0x0 0x20000>; + interrupt-parent = <&p0_mbigen_smmu_alg_a>; + interrupts = <733 1>, + <734 1>, + <735 1>; + interrupt-names = "eventq", "gerror", "priq"; + #iommu-cells = <1>; + dma-coherent; + hisilicon,broken-prefetch-cmd; + /* smmu-cb-memtype = <0x0 0x1>;*/ + }; + p0_smmu_alg_b: smmu_alg@8,d0040000 { + compatible = "arm,smmu-v3"; + reg = <0x8 0xd0040000 0x0 0x20000>; + interrupt-parent = <&p0_mbigen_smmu_alg_b>; + interrupts = <733 1>, + <734 1>, + <735 1>; + interrupt-names = "eventq", "gerror", "priq"; + #iommu-cells = <1>; + dma-coherent; + hisilicon,broken-prefetch-cmd; + /* smmu-cb-memtype = <0x0 0x1>;*/ + }; + p1_smmu_alg_a: smmu_alg@400,d0040000 { + compatible = "arm,smmu-v3"; + reg = <0x400 0xd0040000 0x0 0x20000>; + interrupt-parent = <&p1_mbigen_smmu_alg_a>; + interrupts = <733 1>, + <734 1>, + <735 1>; + interrupt-names = "eventq", "gerror", "priq"; + #iommu-cells = <1>; + dma-coherent; + hisilicon,broken-prefetch-cmd; + /* smmu-cb-memtype = <0x0 0x1>;*/ + }; + p1_smmu_alg_b: smmu_alg@408,d0040000 { + compatible = 
"arm,smmu-v3"; + reg = <0x408 0xd0040000 0x0 0x20000>; + interrupt-parent = <&p1_mbigen_smmu_alg_b>; + interrupts = <733 1>, + <734 1>, + <735 1>; + interrupt-names = "eventq", "gerror", "priq"; + #iommu-cells = <1>; + dma-coherent; + hisilicon,broken-prefetch-cmd; + /* smmu-cb-memtype = <0x0 0x1>;*/ + }; soc { compatible = "simple-bus"; @@ -1603,5 +1722,170 @@ 0x0 0 0 4 &mbigen_pcie2_a 671 4>; status = "disabled"; }; + p0_sec_a: crypto@d2000000 { + compatible = "hisilicon,hip07-sec"; + reg = <0x0 0xd0000000 0x0 0x10000 + 0x0 0xd2000000 0x0 0x10000 + 0x0 0xd2010000 0x0 0x10000 + 0x0 0xd2020000 0x0 0x10000 + 0x0 0xd2030000 0x0 0x10000 + 0x0 0xd2040000 0x0 0x10000 + 0x0 0xd2050000 0x0 0x10000 + 0x0 0xd2060000 0x0 0x10000 + 0x0 0xd2070000 0x0 0x10000 + 0x0 0xd2080000 0x0 0x10000 + 0x0 0xd2090000 0x0 0x10000 + 0x0 0xd20a0000 0x0 0x10000 + 0x0 0xd20b0000 0x0 0x10000 + 0x0 0xd20c0000 0x0 0x10000 + 0x0 0xd20d0000 0x0 0x10000 + 0x0 0xd20e0000 0x0 0x10000 + 0x0 0xd20f0000 0x0 0x10000 + 0x0 0xd2100000 0x0 0x10000>; + interrupt-parent = <&p0_mbigen_sec_a>; + iommus = <&p0_smmu_alg_a 0x600>; + dma-coherent; + interrupts = <576 4>, + <577 1>, <578 4>, + <579 1>, <580 4>, + <581 1>, <582 4>, + <583 1>, <584 4>, + <585 1>, <586 4>, + <587 1>, <588 4>, + <589 1>, <590 4>, + <591 1>, <592 4>, + <593 1>, <594 4>, + <595 1>, <596 4>, + <597 1>, <598 4>, + <599 1>, <600 4>, + <601 1>, <602 4>, + <603 1>, <604 4>, + <605 1>, <606 4>, + <607 1>, <608 4>; + }; + p0_sec_b: crypto@8,d2000000 { + compatible = "hisilicon,hip07-sec"; + reg = <0x8 0xd0000000 0x0 0x10000 + 0x8 0xd2000000 0x0 0x10000 + 0x8 0xd2010000 0x0 0x10000 + 0x8 0xd2020000 0x0 0x10000 + 0x8 0xd2030000 0x0 0x10000 + 0x8 0xd2040000 0x0 0x10000 + 0x8 0xd2050000 0x0 0x10000 + 0x8 0xd2060000 0x0 0x10000 + 0x8 0xd2070000 0x0 0x10000 + 0x8 0xd2080000 0x0 0x10000 + 0x8 0xd2090000 0x0 0x10000 + 0x8 0xd20a0000 0x0 0x10000 + 0x8 0xd20b0000 0x0 0x10000 + 0x8 0xd20c0000 0x0 0x10000 + 0x8 0xd20d0000 0x0 0x10000 + 0x8 0xd20e0000 0x0 0x10000 + 0x8 0xd20f0000 0x0 0x10000 + 0x8 0xd2100000 0x0 0x10000>; + interrupt-parent = <&p0_mbigen_sec_b>; + iommus = <&p0_smmu_alg_b 0x600>; + dma-coherent; + interrupts = <576 4>, + <577 1>, <578 4>, + <579 1>, <580 4>, + <581 1>, <582 4>, + <583 1>, <584 4>, + <585 1>, <586 4>, + <587 1>, <588 4>, + <589 1>, <590 4>, + <591 1>, <592 4>, + <593 1>, <594 4>, + <595 1>, <596 4>, + <597 1>, <598 4>, + <599 1>, <600 4>, + <601 1>, <602 4>, + <603 1>, <604 4>, + <605 1>, <606 4>, + <607 1>, <608 4>; + }; + p1_sec_a: crypto@400,d2000000 { + compatible = "hisilicon,hip07-sec"; + reg = <0x400 0xd0000000 0x0 0x10000 + 0x400 0xd2000000 0x0 0x10000 + 0x400 0xd2010000 0x0 0x10000 + 0x400 0xd2020000 0x0 0x10000 + 0x400 0xd2030000 0x0 0x10000 + 0x400 0xd2040000 0x0 0x10000 + 0x400 0xd2050000 0x0 0x10000 + 0x400 0xd2060000 0x0 0x10000 + 0x400 0xd2070000 0x0 0x10000 + 0x400 0xd2080000 0x0 0x10000 + 0x400 0xd2090000 0x0 0x10000 + 0x400 0xd20a0000 0x0 0x10000 + 0x400 0xd20b0000 0x0 0x10000 + 0x400 0xd20c0000 0x0 0x10000 + 0x400 0xd20d0000 0x0 0x10000 + 0x400 0xd20e0000 0x0 0x10000 + 0x400 0xd20f0000 0x0 0x10000 + 0x400 0xd2100000 0x0 0x10000>; + interrupt-parent = <&p1_mbigen_sec_a>; + iommus = <&p1_smmu_alg_a 0x600>; + dma-coherent; + interrupts = <576 4>, + <577 1>, <578 4>, + <579 1>, <580 4>, + <581 1>, <582 4>, + <583 1>, <584 4>, + <585 1>, <586 4>, + <587 1>, <588 4>, + <589 1>, <590 4>, + <591 1>, <592 4>, + <593 1>, <594 4>, + <595 1>, <596 4>, + <597 1>, <598 4>, + <599 1>, <600 4>, + <601 1>, <602 4>, + <603 1>, <604 4>, + <605 1>, <606 
4>, + <607 1>, <608 4>; + }; + p1_sec_b: crypto@408,d2000000 { + compatible = "hisilicon,hip07-sec"; + reg = <0x408 0xd0000000 0x0 0x10000 + 0x408 0xd2000000 0x0 0x10000 + 0x408 0xd2010000 0x0 0x10000 + 0x408 0xd2020000 0x0 0x10000 + 0x408 0xd2030000 0x0 0x10000 + 0x408 0xd2040000 0x0 0x10000 + 0x408 0xd2050000 0x0 0x10000 + 0x408 0xd2060000 0x0 0x10000 + 0x408 0xd2070000 0x0 0x10000 + 0x408 0xd2080000 0x0 0x10000 + 0x408 0xd2090000 0x0 0x10000 + 0x408 0xd20a0000 0x0 0x10000 + 0x408 0xd20b0000 0x0 0x10000 + 0x408 0xd20c0000 0x0 0x10000 + 0x408 0xd20d0000 0x0 0x10000 + 0x408 0xd20e0000 0x0 0x10000 + 0x408 0xd20f0000 0x0 0x10000 + 0x408 0xd2100000 0x0 0x10000>; + interrupt-parent = <&p1_mbigen_sec_b>; + iommus = <&p1_smmu_alg_b 0x600>; + dma-coherent; + interrupts = <576 4>, + <577 1>, <578 4>, + <579 1>, <580 4>, + <581 1>, <582 4>, + <583 1>, <584 4>, + <585 1>, <586 4>, + <587 1>, <588 4>, + <589 1>, <590 4>, + <591 1>, <592 4>, + <593 1>, <594 4>, + <595 1>, <596 4>, + <597 1>, <598 4>, + <599 1>, <600 4>, + <601 1>, <602 4>, + <603 1>, <604 4>, + <605 1>, <606 4>, + <607 1>, <608 4>; + }; + }; }; -- cgit v1.2.3 From 71e52c278c54db10e368c54687234390357b08d6 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 30 Jul 2018 23:06:40 +0200 Subject: crypto: arm64/aes-ce-gcm - operate on two input blocks at a time Update the core AES/GCM transform and the associated plumbing to operate on 2 AES/GHASH blocks at a time. By itself, this is not expected to result in a noticeable speedup, but it paves the way for reimplementing the GHASH component using 2-way aggregation. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/ghash-ce-core.S | 127 +++++++++++++++++++++++++++++--------- arch/arm64/crypto/ghash-ce-glue.c | 103 +++++++++++++++++++------------ 2 files changed, 161 insertions(+), 69 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S index c723647b37db..dac0df29d194 100644 --- a/arch/arm64/crypto/ghash-ce-core.S +++ b/arch/arm64/crypto/ghash-ce-core.S @@ -286,9 +286,10 @@ ENTRY(pmull_ghash_update_p8) __pmull_ghash p8 ENDPROC(pmull_ghash_update_p8) - KS .req v8 - CTR .req v9 - INP .req v10 + KS0 .req v8 + KS1 .req v9 + INP0 .req v10 + INP1 .req v11 .macro load_round_keys, rounds, rk cmp \rounds, #12 @@ -336,84 +337,146 @@ CPU_LE( rev x8, x8 ) .if \enc == 1 ldr x10, [sp] - ld1 {KS.16b}, [x10] + ld1 {KS0.16b-KS1.16b}, [x10] .endif -0: ld1 {CTR.8b}, [x5] // load upper counter - ld1 {INP.16b}, [x3], #16 +0: ld1 {INP0.16b-INP1.16b}, [x3], #32 + rev x9, x8 - add x8, x8, #1 - sub w0, w0, #1 - ins CTR.d[1], x9 // set lower counter + add x11, x8, #1 + add x8, x8, #2 .if \enc == 1 - eor INP.16b, INP.16b, KS.16b // encrypt input - st1 {INP.16b}, [x2], #16 + eor INP0.16b, INP0.16b, KS0.16b // encrypt input + eor INP1.16b, INP1.16b, KS1.16b .endif - rev64 T1.16b, INP.16b + ld1 {KS0.8b}, [x5] // load upper counter + rev x11, x11 + sub w0, w0, #2 + mov KS1.8b, KS0.8b + ins KS0.d[1], x9 // set lower counter + ins KS1.d[1], x11 + + rev64 T1.16b, INP0.16b cmp w7, #12 b.ge 2f // AES-192/256? 
-1: enc_round CTR, v21 +1: enc_round KS0, v21 ext T2.16b, XL.16b, XL.16b, #8 ext IN1.16b, T1.16b, T1.16b, #8 - enc_round CTR, v22 + enc_round KS1, v21 eor T1.16b, T1.16b, T2.16b eor XL.16b, XL.16b, IN1.16b - enc_round CTR, v23 + enc_round KS0, v22 pmull2 XH.1q, SHASH.2d, XL.2d // a1 * b1 eor T1.16b, T1.16b, XL.16b - enc_round CTR, v24 + enc_round KS1, v22 pmull XL.1q, SHASH.1d, XL.1d // a0 * b0 pmull XM.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0) - enc_round CTR, v25 + enc_round KS0, v23 ext T1.16b, XL.16b, XH.16b, #8 eor T2.16b, XL.16b, XH.16b eor XM.16b, XM.16b, T1.16b - enc_round CTR, v26 + enc_round KS1, v23 eor XM.16b, XM.16b, T2.16b pmull T2.1q, XL.1d, MASK.1d - enc_round CTR, v27 + enc_round KS0, v24 mov XH.d[0], XM.d[1] mov XM.d[1], XL.d[0] - enc_round CTR, v28 + enc_round KS1, v24 eor XL.16b, XM.16b, T2.16b - enc_round CTR, v29 + enc_round KS0, v25 ext T2.16b, XL.16b, XL.16b, #8 - aese CTR.16b, v30.16b + enc_round KS1, v25 pmull XL.1q, XL.1d, MASK.1d eor T2.16b, T2.16b, XH.16b - eor KS.16b, CTR.16b, v31.16b + enc_round KS0, v26 + + eor XL.16b, XL.16b, T2.16b + rev64 T1.16b, INP1.16b + + enc_round KS1, v26 + + ext T2.16b, XL.16b, XL.16b, #8 + ext IN1.16b, T1.16b, T1.16b, #8 + + enc_round KS0, v27 + + eor T1.16b, T1.16b, T2.16b + eor XL.16b, XL.16b, IN1.16b + + enc_round KS1, v27 + + pmull2 XH.1q, SHASH.2d, XL.2d // a1 * b1 + eor T1.16b, T1.16b, XL.16b + + enc_round KS0, v28 + + pmull XL.1q, SHASH.1d, XL.1d // a0 * b0 + pmull XM.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0) + + enc_round KS1, v28 + + ext T1.16b, XL.16b, XH.16b, #8 + eor T2.16b, XL.16b, XH.16b + eor XM.16b, XM.16b, T1.16b + + enc_round KS0, v29 + + eor XM.16b, XM.16b, T2.16b + pmull T2.1q, XL.1d, MASK.1d + + enc_round KS1, v29 + + mov XH.d[0], XM.d[1] + mov XM.d[1], XL.d[0] + + aese KS0.16b, v30.16b + + eor XL.16b, XM.16b, T2.16b + + aese KS1.16b, v30.16b + + ext T2.16b, XL.16b, XL.16b, #8 + + eor KS0.16b, KS0.16b, v31.16b + + pmull XL.1q, XL.1d, MASK.1d + eor T2.16b, T2.16b, XH.16b + + eor KS1.16b, KS1.16b, v31.16b eor XL.16b, XL.16b, T2.16b .if \enc == 0 - eor INP.16b, INP.16b, KS.16b - st1 {INP.16b}, [x2], #16 + eor INP0.16b, INP0.16b, KS0.16b + eor INP1.16b, INP1.16b, KS1.16b .endif + st1 {INP0.16b-INP1.16b}, [x2], #32 + cbnz w0, 0b CPU_LE( rev x8, x8 ) @@ -421,16 +484,20 @@ CPU_LE( rev x8, x8 ) str x8, [x5, #8] // store lower counter .if \enc == 1 - st1 {KS.16b}, [x10] + st1 {KS0.16b-KS1.16b}, [x10] .endif ret 2: b.eq 3f // AES-192? 
- enc_round CTR, v17 - enc_round CTR, v18 -3: enc_round CTR, v19 - enc_round CTR, v20 + enc_round KS0, v17 + enc_round KS1, v17 + enc_round KS0, v18 + enc_round KS1, v18 +3: enc_round KS0, v19 + enc_round KS1, v19 + enc_round KS0, v20 + enc_round KS1, v20 b 1b .endm diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c index 18b4d1b96a7e..8ff6732c4fb5 100644 --- a/arch/arm64/crypto/ghash-ce-glue.c +++ b/arch/arm64/crypto/ghash-ce-glue.c @@ -348,9 +348,10 @@ static int gcm_encrypt(struct aead_request *req) struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead); struct skcipher_walk walk; u8 iv[AES_BLOCK_SIZE]; - u8 ks[AES_BLOCK_SIZE]; + u8 ks[2 * AES_BLOCK_SIZE]; u8 tag[AES_BLOCK_SIZE]; u64 dg[2] = {}; + int nrounds = num_rounds(&ctx->aes_key); int err; if (req->assoclen) @@ -362,32 +363,31 @@ static int gcm_encrypt(struct aead_request *req) if (likely(may_use_simd())) { kernel_neon_begin(); - pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc, - num_rounds(&ctx->aes_key)); + pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc, nrounds); put_unaligned_be32(2, iv + GCM_IV_SIZE); - pmull_gcm_encrypt_block(ks, iv, NULL, - num_rounds(&ctx->aes_key)); + pmull_gcm_encrypt_block(ks, iv, NULL, nrounds); put_unaligned_be32(3, iv + GCM_IV_SIZE); + pmull_gcm_encrypt_block(ks + AES_BLOCK_SIZE, iv, NULL, nrounds); + put_unaligned_be32(4, iv + GCM_IV_SIZE); kernel_neon_end(); err = skcipher_walk_aead_encrypt(&walk, req, false); - while (walk.nbytes >= AES_BLOCK_SIZE) { - int blocks = walk.nbytes / AES_BLOCK_SIZE; + while (walk.nbytes >= 2 * AES_BLOCK_SIZE) { + int blocks = walk.nbytes / (2 * AES_BLOCK_SIZE) * 2; kernel_neon_begin(); pmull_gcm_encrypt(blocks, dg, walk.dst.virt.addr, walk.src.virt.addr, &ctx->ghash_key, - iv, ctx->aes_key.key_enc, - num_rounds(&ctx->aes_key), ks); + iv, ctx->aes_key.key_enc, nrounds, + ks); kernel_neon_end(); err = skcipher_walk_done(&walk, - walk.nbytes % AES_BLOCK_SIZE); + walk.nbytes % (2 * AES_BLOCK_SIZE)); } } else { - __aes_arm64_encrypt(ctx->aes_key.key_enc, tag, iv, - num_rounds(&ctx->aes_key)); + __aes_arm64_encrypt(ctx->aes_key.key_enc, tag, iv, nrounds); put_unaligned_be32(2, iv + GCM_IV_SIZE); err = skcipher_walk_aead_encrypt(&walk, req, false); @@ -399,8 +399,7 @@ static int gcm_encrypt(struct aead_request *req) do { __aes_arm64_encrypt(ctx->aes_key.key_enc, - ks, iv, - num_rounds(&ctx->aes_key)); + ks, iv, nrounds); crypto_xor_cpy(dst, src, ks, AES_BLOCK_SIZE); crypto_inc(iv, AES_BLOCK_SIZE); @@ -417,19 +416,28 @@ static int gcm_encrypt(struct aead_request *req) } if (walk.nbytes) __aes_arm64_encrypt(ctx->aes_key.key_enc, ks, iv, - num_rounds(&ctx->aes_key)); + nrounds); } /* handle the tail */ if (walk.nbytes) { u8 buf[GHASH_BLOCK_SIZE]; + unsigned int nbytes = walk.nbytes; + u8 *dst = walk.dst.virt.addr; + u8 *head = NULL; crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr, ks, walk.nbytes); - memcpy(buf, walk.dst.virt.addr, walk.nbytes); - memset(buf + walk.nbytes, 0, GHASH_BLOCK_SIZE - walk.nbytes); - ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL); + if (walk.nbytes > GHASH_BLOCK_SIZE) { + head = dst; + dst += GHASH_BLOCK_SIZE; + nbytes %= GHASH_BLOCK_SIZE; + } + + memcpy(buf, dst, nbytes); + memset(buf + nbytes, 0, GHASH_BLOCK_SIZE - nbytes); + ghash_do_update(!!nbytes, dg, buf, &ctx->ghash_key, head); err = skcipher_walk_done(&walk, 0); } @@ -452,10 +460,11 @@ static int gcm_decrypt(struct aead_request *req) struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead); unsigned int authsize = crypto_aead_authsize(aead); struct 
skcipher_walk walk; - u8 iv[AES_BLOCK_SIZE]; + u8 iv[2 * AES_BLOCK_SIZE]; u8 tag[AES_BLOCK_SIZE]; - u8 buf[GHASH_BLOCK_SIZE]; + u8 buf[2 * GHASH_BLOCK_SIZE]; u64 dg[2] = {}; + int nrounds = num_rounds(&ctx->aes_key); int err; if (req->assoclen) @@ -466,37 +475,44 @@ static int gcm_decrypt(struct aead_request *req) if (likely(may_use_simd())) { kernel_neon_begin(); - - pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc, - num_rounds(&ctx->aes_key)); + pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc, nrounds); put_unaligned_be32(2, iv + GCM_IV_SIZE); kernel_neon_end(); err = skcipher_walk_aead_decrypt(&walk, req, false); - while (walk.nbytes >= AES_BLOCK_SIZE) { - int blocks = walk.nbytes / AES_BLOCK_SIZE; + while (walk.nbytes >= 2 * AES_BLOCK_SIZE) { + int blocks = walk.nbytes / (2 * AES_BLOCK_SIZE) * 2; kernel_neon_begin(); pmull_gcm_decrypt(blocks, dg, walk.dst.virt.addr, walk.src.virt.addr, &ctx->ghash_key, - iv, ctx->aes_key.key_enc, - num_rounds(&ctx->aes_key)); + iv, ctx->aes_key.key_enc, nrounds); kernel_neon_end(); err = skcipher_walk_done(&walk, - walk.nbytes % AES_BLOCK_SIZE); + walk.nbytes % (2 * AES_BLOCK_SIZE)); } + if (walk.nbytes) { + u8 *iv2 = iv + AES_BLOCK_SIZE; + + if (walk.nbytes > AES_BLOCK_SIZE) { + memcpy(iv2, iv, AES_BLOCK_SIZE); + crypto_inc(iv2, AES_BLOCK_SIZE); + } + kernel_neon_begin(); pmull_gcm_encrypt_block(iv, iv, ctx->aes_key.key_enc, - num_rounds(&ctx->aes_key)); + nrounds); + + if (walk.nbytes > AES_BLOCK_SIZE) + pmull_gcm_encrypt_block(iv2, iv2, NULL, + nrounds); kernel_neon_end(); } - } else { - __aes_arm64_encrypt(ctx->aes_key.key_enc, tag, iv, - num_rounds(&ctx->aes_key)); + __aes_arm64_encrypt(ctx->aes_key.key_enc, tag, iv, nrounds); put_unaligned_be32(2, iv + GCM_IV_SIZE); err = skcipher_walk_aead_decrypt(&walk, req, false); @@ -511,8 +527,7 @@ static int gcm_decrypt(struct aead_request *req) do { __aes_arm64_encrypt(ctx->aes_key.key_enc, - buf, iv, - num_rounds(&ctx->aes_key)); + buf, iv, nrounds); crypto_xor_cpy(dst, src, buf, AES_BLOCK_SIZE); crypto_inc(iv, AES_BLOCK_SIZE); @@ -525,14 +540,24 @@ static int gcm_decrypt(struct aead_request *req) } if (walk.nbytes) __aes_arm64_encrypt(ctx->aes_key.key_enc, iv, iv, - num_rounds(&ctx->aes_key)); + nrounds); } /* handle the tail */ if (walk.nbytes) { - memcpy(buf, walk.src.virt.addr, walk.nbytes); - memset(buf + walk.nbytes, 0, GHASH_BLOCK_SIZE - walk.nbytes); - ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL); + const u8 *src = walk.src.virt.addr; + const u8 *head = NULL; + unsigned int nbytes = walk.nbytes; + + if (walk.nbytes > GHASH_BLOCK_SIZE) { + head = src; + src += GHASH_BLOCK_SIZE; + nbytes %= GHASH_BLOCK_SIZE; + } + + memcpy(buf, src, nbytes); + memset(buf + nbytes, 0, GHASH_BLOCK_SIZE - nbytes); + ghash_do_update(!!nbytes, dg, buf, &ctx->ghash_key, head); crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr, iv, walk.nbytes); @@ -557,7 +582,7 @@ static int gcm_decrypt(struct aead_request *req) static struct aead_alg gcm_aes_alg = { .ivsize = GCM_IV_SIZE, - .chunksize = AES_BLOCK_SIZE, + .chunksize = 2 * AES_BLOCK_SIZE, .maxauthsize = AES_BLOCK_SIZE, .setkey = gcm_setkey, .setauthsize = gcm_setauthsize, -- cgit v1.2.3 From e0bd888dc487e0c444ee5f3bf55020862d16a225 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 30 Jul 2018 23:06:41 +0200 Subject: crypto: arm64/aes-ce-gcm - implement 2-way aggregation Implement a faster version of the GHASH transform which amortizes the reduction modulo the characteristic polynomial across two input blocks at a time. 
On a Cortex-A53, the gcm(aes) performance increases 24%, from 3.0 cycles per byte to 2.4 cpb for large input sizes. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/ghash-ce-core.S | 86 +++++++++++++-------------------------- arch/arm64/crypto/ghash-ce-glue.c | 34 +++++++++++----- 2 files changed, 52 insertions(+), 68 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S index dac0df29d194..f7281e7a592f 100644 --- a/arch/arm64/crypto/ghash-ce-core.S +++ b/arch/arm64/crypto/ghash-ce-core.S @@ -290,6 +290,10 @@ ENDPROC(pmull_ghash_update_p8) KS1 .req v9 INP0 .req v10 INP1 .req v11 + HH .req v12 + XL2 .req v13 + XM2 .req v14 + XH2 .req v15 .macro load_round_keys, rounds, rk cmp \rounds, #12 @@ -323,6 +327,7 @@ ENDPROC(pmull_ghash_update_p8) .endm .macro pmull_gcm_do_crypt, enc + ld1 {HH.2d}, [x4], #16 ld1 {SHASH.2d}, [x4] ld1 {XL.2d}, [x1] ldr x8, [x5, #8] // load lower counter @@ -330,10 +335,11 @@ ENDPROC(pmull_ghash_update_p8) load_round_keys w7, x6 movi MASK.16b, #0xe1 - ext SHASH2.16b, SHASH.16b, SHASH.16b, #8 + trn1 SHASH2.2d, SHASH.2d, HH.2d + trn2 T1.2d, SHASH.2d, HH.2d CPU_LE( rev x8, x8 ) shl MASK.2d, MASK.2d, #57 - eor SHASH2.16b, SHASH2.16b, SHASH.16b + eor SHASH2.16b, SHASH2.16b, T1.16b .if \enc == 1 ldr x10, [sp] @@ -358,116 +364,82 @@ CPU_LE( rev x8, x8 ) ins KS0.d[1], x9 // set lower counter ins KS1.d[1], x11 - rev64 T1.16b, INP0.16b + rev64 T1.16b, INP1.16b cmp w7, #12 b.ge 2f // AES-192/256? 1: enc_round KS0, v21 - - ext T2.16b, XL.16b, XL.16b, #8 ext IN1.16b, T1.16b, T1.16b, #8 enc_round KS1, v21 - - eor T1.16b, T1.16b, T2.16b - eor XL.16b, XL.16b, IN1.16b + pmull2 XH2.1q, SHASH.2d, IN1.2d // a1 * b1 enc_round KS0, v22 - - pmull2 XH.1q, SHASH.2d, XL.2d // a1 * b1 - eor T1.16b, T1.16b, XL.16b + eor T1.16b, T1.16b, IN1.16b enc_round KS1, v22 - - pmull XL.1q, SHASH.1d, XL.1d // a0 * b0 - pmull XM.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0) + pmull XL2.1q, SHASH.1d, IN1.1d // a0 * b0 enc_round KS0, v23 - - ext T1.16b, XL.16b, XH.16b, #8 - eor T2.16b, XL.16b, XH.16b - eor XM.16b, XM.16b, T1.16b + pmull XM2.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0) enc_round KS1, v23 - - eor XM.16b, XM.16b, T2.16b - pmull T2.1q, XL.1d, MASK.1d + rev64 T1.16b, INP0.16b + ext T2.16b, XL.16b, XL.16b, #8 enc_round KS0, v24 - - mov XH.d[0], XM.d[1] - mov XM.d[1], XL.d[0] + ext IN1.16b, T1.16b, T1.16b, #8 + eor T1.16b, T1.16b, T2.16b enc_round KS1, v24 - - eor XL.16b, XM.16b, T2.16b + eor XL.16b, XL.16b, IN1.16b enc_round KS0, v25 - - ext T2.16b, XL.16b, XL.16b, #8 + eor T1.16b, T1.16b, XL.16b enc_round KS1, v25 - - pmull XL.1q, XL.1d, MASK.1d - eor T2.16b, T2.16b, XH.16b + pmull2 XH.1q, HH.2d, XL.2d // a1 * b1 enc_round KS0, v26 - - eor XL.16b, XL.16b, T2.16b - rev64 T1.16b, INP1.16b + pmull XL.1q, HH.1d, XL.1d // a0 * b0 enc_round KS1, v26 - - ext T2.16b, XL.16b, XL.16b, #8 - ext IN1.16b, T1.16b, T1.16b, #8 + pmull2 XM.1q, SHASH2.2d, T1.2d // (a1 + a0)(b1 + b0) enc_round KS0, v27 - - eor T1.16b, T1.16b, T2.16b - eor XL.16b, XL.16b, IN1.16b + eor XL.16b, XL.16b, XL2.16b + eor XH.16b, XH.16b, XH2.16b enc_round KS1, v27 - - pmull2 XH.1q, SHASH.2d, XL.2d // a1 * b1 - eor T1.16b, T1.16b, XL.16b + eor XM.16b, XM.16b, XM2.16b + ext T1.16b, XL.16b, XH.16b, #8 enc_round KS0, v28 - - pmull XL.1q, SHASH.1d, XL.1d // a0 * b0 - pmull XM.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0) - - enc_round KS1, v28 - - ext T1.16b, XL.16b, XH.16b, #8 eor T2.16b, XL.16b, XH.16b eor XM.16b, XM.16b, T1.16b - enc_round KS0, v29 - + 
enc_round KS1, v28 eor XM.16b, XM.16b, T2.16b + + enc_round KS0, v29 pmull T2.1q, XL.1d, MASK.1d enc_round KS1, v29 - mov XH.d[0], XM.d[1] mov XM.d[1], XL.d[0] aese KS0.16b, v30.16b - eor XL.16b, XM.16b, T2.16b aese KS1.16b, v30.16b - ext T2.16b, XL.16b, XL.16b, #8 eor KS0.16b, KS0.16b, v31.16b - pmull XL.1q, XL.1d, MASK.1d eor T2.16b, T2.16b, XH.16b eor KS1.16b, KS1.16b, v31.16b - eor XL.16b, XL.16b, T2.16b .if \enc == 0 diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c index 8ff6732c4fb5..cd91b146c87d 100644 --- a/arch/arm64/crypto/ghash-ce-glue.c +++ b/arch/arm64/crypto/ghash-ce-glue.c @@ -46,6 +46,7 @@ struct ghash_desc_ctx { struct gcm_aes_ctx { struct crypto_aes_ctx aes_key; + u64 h2[2]; struct ghash_key ghash_key; }; @@ -62,12 +63,11 @@ static void (*pmull_ghash_update)(int blocks, u64 dg[], const char *src, const char *head); asmlinkage void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], - const u8 src[], struct ghash_key const *k, - u8 ctr[], u32 const rk[], int rounds, - u8 ks[]); + const u8 src[], u64 const *k, u8 ctr[], + u32 const rk[], int rounds, u8 ks[]); asmlinkage void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], - const u8 src[], struct ghash_key const *k, + const u8 src[], u64 const *k, u8 ctr[], u32 const rk[], int rounds); asmlinkage void pmull_gcm_encrypt_block(u8 dst[], u8 const src[], @@ -232,7 +232,8 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *inkey, unsigned int keylen) { struct gcm_aes_ctx *ctx = crypto_aead_ctx(tfm); - u8 key[GHASH_BLOCK_SIZE]; + be128 h1, h2; + u8 *key = (u8 *)&h1; int ret; ret = crypto_aes_expand_key(&ctx->aes_key, inkey, keylen); @@ -244,7 +245,19 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *inkey, __aes_arm64_encrypt(ctx->aes_key.key_enc, key, (u8[AES_BLOCK_SIZE]){}, num_rounds(&ctx->aes_key)); - return __ghash_setkey(&ctx->ghash_key, key, sizeof(key)); + __ghash_setkey(&ctx->ghash_key, key, sizeof(be128)); + + /* calculate H^2 (used for 2-way aggregation) */ + h2 = h1; + gf128mul_lle(&h2, &h1); + + ctx->h2[0] = (be64_to_cpu(h2.b) << 1) | (be64_to_cpu(h2.a) >> 63); + ctx->h2[1] = (be64_to_cpu(h2.a) << 1) | (be64_to_cpu(h2.b) >> 63); + + if (be64_to_cpu(h2.a) >> 63) + ctx->h2[1] ^= 0xc200000000000000UL; + + return 0; } static int gcm_setauthsize(struct crypto_aead *tfm, unsigned int authsize) @@ -378,9 +391,8 @@ static int gcm_encrypt(struct aead_request *req) kernel_neon_begin(); pmull_gcm_encrypt(blocks, dg, walk.dst.virt.addr, - walk.src.virt.addr, &ctx->ghash_key, - iv, ctx->aes_key.key_enc, nrounds, - ks); + walk.src.virt.addr, ctx->h2, iv, + ctx->aes_key.key_enc, nrounds, ks); kernel_neon_end(); err = skcipher_walk_done(&walk, @@ -486,8 +498,8 @@ static int gcm_decrypt(struct aead_request *req) kernel_neon_begin(); pmull_gcm_decrypt(blocks, dg, walk.dst.virt.addr, - walk.src.virt.addr, &ctx->ghash_key, - iv, ctx->aes_key.key_enc, nrounds); + walk.src.virt.addr, ctx->h2, iv, + ctx->aes_key.key_enc, nrounds); kernel_neon_end(); err = skcipher_walk_done(&walk, -- cgit v1.2.3 From 30f1a9f53e77e4c9ddf55ebfda8a9d7666e46964 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 30 Jul 2018 23:06:42 +0200 Subject: crypto: arm64/aes-ce-gcm - don't reload key schedule if avoidable Squeeze out another 5% of performance by minimizing the number of invocations of kernel_neon_begin()/kernel_neon_end() on the common path, which also allows some reloads of the key schedule to be optimized away. The resulting code runs at 2.3 cycles per byte on a Cortex-A53. 
Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/ghash-ce-core.S | 9 +++-- arch/arm64/crypto/ghash-ce-glue.c | 81 +++++++++++++++++++++------------------ 2 files changed, 49 insertions(+), 41 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S index f7281e7a592f..913e49932ae6 100644 --- a/arch/arm64/crypto/ghash-ce-core.S +++ b/arch/arm64/crypto/ghash-ce-core.S @@ -1,7 +1,7 @@ /* * Accelerated GHASH implementation with ARMv8 PMULL instructions. * - * Copyright (C) 2014 - 2017 Linaro Ltd. + * Copyright (C) 2014 - 2018 Linaro Ltd. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 as published @@ -332,8 +332,6 @@ ENDPROC(pmull_ghash_update_p8) ld1 {XL.2d}, [x1] ldr x8, [x5, #8] // load lower counter - load_round_keys w7, x6 - movi MASK.16b, #0xe1 trn1 SHASH2.2d, SHASH.2d, HH.2d trn2 T1.2d, SHASH.2d, HH.2d @@ -346,6 +344,8 @@ CPU_LE( rev x8, x8 ) ld1 {KS0.16b-KS1.16b}, [x10] .endif + cbnz x6, 4f + 0: ld1 {INP0.16b-INP1.16b}, [x3], #32 rev x9, x8 @@ -471,6 +471,9 @@ CPU_LE( rev x8, x8 ) enc_round KS0, v20 enc_round KS1, v20 b 1b + +4: load_round_keys w7, x6 + b 0b .endm /* diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c index cd91b146c87d..42a0e84e276c 100644 --- a/arch/arm64/crypto/ghash-ce-glue.c +++ b/arch/arm64/crypto/ghash-ce-glue.c @@ -1,7 +1,7 @@ /* * Accelerated GHASH implementation with ARMv8 PMULL instructions. * - * Copyright (C) 2014 - 2017 Linaro Ltd. + * Copyright (C) 2014 - 2018 Linaro Ltd. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 as published @@ -373,37 +373,39 @@ static int gcm_encrypt(struct aead_request *req) memcpy(iv, req->iv, GCM_IV_SIZE); put_unaligned_be32(1, iv + GCM_IV_SIZE); - if (likely(may_use_simd())) { - kernel_neon_begin(); + err = skcipher_walk_aead_encrypt(&walk, req, false); + if (likely(may_use_simd() && walk.total >= 2 * AES_BLOCK_SIZE)) { + u32 const *rk = NULL; + + kernel_neon_begin(); pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc, nrounds); put_unaligned_be32(2, iv + GCM_IV_SIZE); pmull_gcm_encrypt_block(ks, iv, NULL, nrounds); put_unaligned_be32(3, iv + GCM_IV_SIZE); pmull_gcm_encrypt_block(ks + AES_BLOCK_SIZE, iv, NULL, nrounds); put_unaligned_be32(4, iv + GCM_IV_SIZE); - kernel_neon_end(); - - err = skcipher_walk_aead_encrypt(&walk, req, false); - while (walk.nbytes >= 2 * AES_BLOCK_SIZE) { + do { int blocks = walk.nbytes / (2 * AES_BLOCK_SIZE) * 2; - kernel_neon_begin(); + if (rk) + kernel_neon_begin(); + pmull_gcm_encrypt(blocks, dg, walk.dst.virt.addr, walk.src.virt.addr, ctx->h2, iv, - ctx->aes_key.key_enc, nrounds, ks); + rk, nrounds, ks); kernel_neon_end(); err = skcipher_walk_done(&walk, walk.nbytes % (2 * AES_BLOCK_SIZE)); - } + + rk = ctx->aes_key.key_enc; + } while (walk.nbytes >= 2 * AES_BLOCK_SIZE); } else { __aes_arm64_encrypt(ctx->aes_key.key_enc, tag, iv, nrounds); put_unaligned_be32(2, iv + GCM_IV_SIZE); - err = skcipher_walk_aead_encrypt(&walk, req, false); - while (walk.nbytes >= AES_BLOCK_SIZE) { int blocks = walk.nbytes / AES_BLOCK_SIZE; u8 *dst = walk.dst.virt.addr; @@ -485,50 +487,53 @@ static int gcm_decrypt(struct aead_request *req) memcpy(iv, req->iv, GCM_IV_SIZE); put_unaligned_be32(1, iv + GCM_IV_SIZE); - if (likely(may_use_simd())) { + err = skcipher_walk_aead_decrypt(&walk, req, false); + + if 
(likely(may_use_simd() && walk.total >= 2 * AES_BLOCK_SIZE)) { + u32 const *rk = NULL; + kernel_neon_begin(); pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc, nrounds); put_unaligned_be32(2, iv + GCM_IV_SIZE); - kernel_neon_end(); - err = skcipher_walk_aead_decrypt(&walk, req, false); - - while (walk.nbytes >= 2 * AES_BLOCK_SIZE) { + do { int blocks = walk.nbytes / (2 * AES_BLOCK_SIZE) * 2; + int rem = walk.total - blocks * AES_BLOCK_SIZE; + + if (rk) + kernel_neon_begin(); - kernel_neon_begin(); pmull_gcm_decrypt(blocks, dg, walk.dst.virt.addr, walk.src.virt.addr, ctx->h2, iv, - ctx->aes_key.key_enc, nrounds); - kernel_neon_end(); + rk, nrounds); - err = skcipher_walk_done(&walk, - walk.nbytes % (2 * AES_BLOCK_SIZE)); - } + /* check if this is the final iteration of the loop */ + if (rem < (2 * AES_BLOCK_SIZE)) { + u8 *iv2 = iv + AES_BLOCK_SIZE; - if (walk.nbytes) { - u8 *iv2 = iv + AES_BLOCK_SIZE; + if (rem > AES_BLOCK_SIZE) { + memcpy(iv2, iv, AES_BLOCK_SIZE); + crypto_inc(iv2, AES_BLOCK_SIZE); + } - if (walk.nbytes > AES_BLOCK_SIZE) { - memcpy(iv2, iv, AES_BLOCK_SIZE); - crypto_inc(iv2, AES_BLOCK_SIZE); - } + pmull_gcm_encrypt_block(iv, iv, NULL, nrounds); - kernel_neon_begin(); - pmull_gcm_encrypt_block(iv, iv, ctx->aes_key.key_enc, - nrounds); + if (rem > AES_BLOCK_SIZE) + pmull_gcm_encrypt_block(iv2, iv2, NULL, + nrounds); + } - if (walk.nbytes > AES_BLOCK_SIZE) - pmull_gcm_encrypt_block(iv2, iv2, NULL, - nrounds); kernel_neon_end(); - } + + err = skcipher_walk_done(&walk, + walk.nbytes % (2 * AES_BLOCK_SIZE)); + + rk = ctx->aes_key.key_enc; + } while (walk.nbytes >= 2 * AES_BLOCK_SIZE); } else { __aes_arm64_encrypt(ctx->aes_key.key_enc, tag, iv, nrounds); put_unaligned_be32(2, iv + GCM_IV_SIZE); - err = skcipher_walk_aead_decrypt(&walk, req, false); - while (walk.nbytes >= AES_BLOCK_SIZE) { int blocks = walk.nbytes / AES_BLOCK_SIZE; u8 *dst = walk.dst.virt.addr; -- cgit v1.2.3 From 8e492eff7de955e6ed1dc2989b17c41cd862aa28 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 4 Aug 2018 20:46:24 +0200 Subject: crypto: arm64/ghash-ce - replace NEON yield check with block limit Checking the TIF_NEED_RESCHED flag is disproportionately costly on cores with fast crypto instructions and comparatively slow memory accesses. On algorithms such as GHASH, which executes at ~1 cycle per byte on cores that implement support for 64 bit polynomial multiplication, there is really no need to check the TIF_NEED_RESCHED particularly often, and so we can remove the NEON yield check from the assembler routines. However, unlike the AEAD or skcipher APIs, the shash/ahash APIs take arbitrary input lengths, and so there needs to be some sanity check to ensure that we don't hog the CPU for excessive amounts of time. So let's simply cap the maximum input size that is processed in one go to 64 KB. 
Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/ghash-ce-core.S | 39 +++++++++++---------------------------- arch/arm64/crypto/ghash-ce-glue.c | 16 ++++++++++++---- 2 files changed, 23 insertions(+), 32 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S index 913e49932ae6..344811c6a0ca 100644 --- a/arch/arm64/crypto/ghash-ce-core.S +++ b/arch/arm64/crypto/ghash-ce-core.S @@ -213,31 +213,23 @@ .endm .macro __pmull_ghash, pn - frame_push 5 - - mov x19, x0 - mov x20, x1 - mov x21, x2 - mov x22, x3 - mov x23, x4 - -0: ld1 {SHASH.2d}, [x22] - ld1 {XL.2d}, [x20] + ld1 {SHASH.2d}, [x3] + ld1 {XL.2d}, [x1] ext SHASH2.16b, SHASH.16b, SHASH.16b, #8 eor SHASH2.16b, SHASH2.16b, SHASH.16b __pmull_pre_\pn /* do the head block first, if supplied */ - cbz x23, 1f - ld1 {T1.2d}, [x23] - mov x23, xzr - b 2f + cbz x4, 0f + ld1 {T1.2d}, [x4] + mov x4, xzr + b 1f -1: ld1 {T1.2d}, [x21], #16 - sub w19, w19, #1 +0: ld1 {T1.2d}, [x2], #16 + sub w0, w0, #1 -2: /* multiply XL by SHASH in GF(2^128) */ +1: /* multiply XL by SHASH in GF(2^128) */ CPU_LE( rev64 T1.16b, T1.16b ) ext T2.16b, XL.16b, XL.16b, #8 @@ -259,18 +251,9 @@ CPU_LE( rev64 T1.16b, T1.16b ) eor T2.16b, T2.16b, XH.16b eor XL.16b, XL.16b, T2.16b - cbz w19, 3f - - if_will_cond_yield_neon - st1 {XL.2d}, [x20] - do_cond_yield_neon - b 0b - endif_yield_neon - - b 1b + cbnz w0, 0b -3: st1 {XL.2d}, [x20] - frame_pop + st1 {XL.2d}, [x1] ret .endm diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c index 42a0e84e276c..3c2c446dc96c 100644 --- a/arch/arm64/crypto/ghash-ce-glue.c +++ b/arch/arm64/crypto/ghash-ce-glue.c @@ -113,6 +113,9 @@ static void ghash_do_update(int blocks, u64 dg[], const char *src, } } +/* avoid hogging the CPU for too long */ +#define MAX_BLOCKS (SZ_64K / GHASH_BLOCK_SIZE) + static int ghash_update(struct shash_desc *desc, const u8 *src, unsigned int len) { @@ -136,11 +139,16 @@ static int ghash_update(struct shash_desc *desc, const u8 *src, blocks = len / GHASH_BLOCK_SIZE; len %= GHASH_BLOCK_SIZE; - ghash_do_update(blocks, ctx->digest, src, key, - partial ? ctx->buf : NULL); + do { + int chunk = min(blocks, MAX_BLOCKS); + + ghash_do_update(chunk, ctx->digest, src, key, + partial ? ctx->buf : NULL); - src += blocks * GHASH_BLOCK_SIZE; - partial = 0; + blocks -= chunk; + src += chunk * GHASH_BLOCK_SIZE; + partial = 0; + } while (unlikely(blocks > 0)); } if (len) memcpy(ctx->buf + partial, src, len); -- cgit v1.2.3 From 22240df7ac6d76a271197571a7be45addef2ba15 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 4 Aug 2018 20:46:25 +0200 Subject: crypto: arm64/ghash-ce - implement 4-way aggregation Enhance the GHASH implementation that uses 64-bit polynomial multiplication by adding support for 4-way aggregation. This more than doubles the performance, from 2.4 cycles per byte to 1.1 cpb on Cortex-A53. 
Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/ghash-ce-core.S | 122 ++++++++++++++++++++++++++++++++------ arch/arm64/crypto/ghash-ce-glue.c | 71 +++++++++++----------- 2 files changed, 142 insertions(+), 51 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S index 344811c6a0ca..1b319b716d5e 100644 --- a/arch/arm64/crypto/ghash-ce-core.S +++ b/arch/arm64/crypto/ghash-ce-core.S @@ -46,6 +46,19 @@ ss3 .req v26 ss4 .req v27 + XL2 .req v8 + XM2 .req v9 + XH2 .req v10 + XL3 .req v11 + XM3 .req v12 + XH3 .req v13 + TT3 .req v14 + TT4 .req v15 + HH .req v16 + HH3 .req v17 + HH4 .req v18 + HH34 .req v19 + .text .arch armv8-a+crypto @@ -134,11 +147,25 @@ .endm .macro __pmull_pre_p64 + add x8, x3, #16 + ld1 {HH.2d-HH4.2d}, [x8] + + trn1 SHASH2.2d, SHASH.2d, HH.2d + trn2 T1.2d, SHASH.2d, HH.2d + eor SHASH2.16b, SHASH2.16b, T1.16b + + trn1 HH34.2d, HH3.2d, HH4.2d + trn2 T1.2d, HH3.2d, HH4.2d + eor HH34.16b, HH34.16b, T1.16b + movi MASK.16b, #0xe1 shl MASK.2d, MASK.2d, #57 .endm .macro __pmull_pre_p8 + ext SHASH2.16b, SHASH.16b, SHASH.16b, #8 + eor SHASH2.16b, SHASH2.16b, SHASH.16b + // k00_16 := 0x0000000000000000_000000000000ffff // k32_48 := 0x00000000ffffffff_0000ffffffffffff movi k32_48.2d, #0xffffffff @@ -215,8 +242,6 @@ .macro __pmull_ghash, pn ld1 {SHASH.2d}, [x3] ld1 {XL.2d}, [x1] - ext SHASH2.16b, SHASH.16b, SHASH.16b, #8 - eor SHASH2.16b, SHASH2.16b, SHASH.16b __pmull_pre_\pn @@ -224,12 +249,79 @@ cbz x4, 0f ld1 {T1.2d}, [x4] mov x4, xzr - b 1f + b 3f + +0: .ifc \pn, p64 + tbnz w0, #0, 2f // skip until #blocks is a + tbnz w0, #1, 2f // round multiple of 4 + +1: ld1 {XM3.16b-TT4.16b}, [x2], #64 + + sub w0, w0, #4 + + rev64 T1.16b, XM3.16b + rev64 T2.16b, XH3.16b + rev64 TT4.16b, TT4.16b + rev64 TT3.16b, TT3.16b + + ext IN1.16b, TT4.16b, TT4.16b, #8 + ext XL3.16b, TT3.16b, TT3.16b, #8 + + eor TT4.16b, TT4.16b, IN1.16b + pmull2 XH2.1q, SHASH.2d, IN1.2d // a1 * b1 + pmull XL2.1q, SHASH.1d, IN1.1d // a0 * b0 + pmull XM2.1q, SHASH2.1d, TT4.1d // (a1 + a0)(b1 + b0) + + eor TT3.16b, TT3.16b, XL3.16b + pmull2 XH3.1q, HH.2d, XL3.2d // a1 * b1 + pmull XL3.1q, HH.1d, XL3.1d // a0 * b0 + pmull2 XM3.1q, SHASH2.2d, TT3.2d // (a1 + a0)(b1 + b0) + + ext IN1.16b, T2.16b, T2.16b, #8 + eor XL2.16b, XL2.16b, XL3.16b + eor XH2.16b, XH2.16b, XH3.16b + eor XM2.16b, XM2.16b, XM3.16b + + eor T2.16b, T2.16b, IN1.16b + pmull2 XH3.1q, HH3.2d, IN1.2d // a1 * b1 + pmull XL3.1q, HH3.1d, IN1.1d // a0 * b0 + pmull XM3.1q, HH34.1d, T2.1d // (a1 + a0)(b1 + b0) -0: ld1 {T1.2d}, [x2], #16 + eor XL2.16b, XL2.16b, XL3.16b + eor XH2.16b, XH2.16b, XH3.16b + eor XM2.16b, XM2.16b, XM3.16b + + ext IN1.16b, T1.16b, T1.16b, #8 + ext TT3.16b, XL.16b, XL.16b, #8 + eor XL.16b, XL.16b, IN1.16b + eor T1.16b, T1.16b, TT3.16b + + pmull2 XH.1q, HH4.2d, XL.2d // a1 * b1 + eor T1.16b, T1.16b, XL.16b + pmull XL.1q, HH4.1d, XL.1d // a0 * b0 + pmull2 XM.1q, HH34.2d, T1.2d // (a1 + a0)(b1 + b0) + + eor XL.16b, XL.16b, XL2.16b + eor XH.16b, XH.16b, XH2.16b + eor XM.16b, XM.16b, XM2.16b + + eor T2.16b, XL.16b, XH.16b + ext T1.16b, XL.16b, XH.16b, #8 + eor XM.16b, XM.16b, T2.16b + + __pmull_reduce_p64 + + eor T2.16b, T2.16b, XH.16b + eor XL.16b, XL.16b, T2.16b + + cbz w0, 5f + b 1b + .endif + +2: ld1 {T1.2d}, [x2], #16 sub w0, w0, #1 -1: /* multiply XL by SHASH in GF(2^128) */ +3: /* multiply XL by SHASH in GF(2^128) */ CPU_LE( rev64 T1.16b, T1.16b ) ext T2.16b, XL.16b, XL.16b, #8 @@ -242,7 +334,7 @@ CPU_LE( rev64 T1.16b, T1.16b ) __pmull_\pn XL, XL, 
SHASH // a0 * b0 __pmull_\pn XM, T1, SHASH2 // (a1 + a0)(b1 + b0) - eor T2.16b, XL.16b, XH.16b +4: eor T2.16b, XL.16b, XH.16b ext T1.16b, XL.16b, XH.16b, #8 eor XM.16b, XM.16b, T2.16b @@ -253,7 +345,7 @@ CPU_LE( rev64 T1.16b, T1.16b ) cbnz w0, 0b - st1 {XL.2d}, [x1] +5: st1 {XL.2d}, [x1] ret .endm @@ -269,14 +361,10 @@ ENTRY(pmull_ghash_update_p8) __pmull_ghash p8 ENDPROC(pmull_ghash_update_p8) - KS0 .req v8 - KS1 .req v9 - INP0 .req v10 - INP1 .req v11 - HH .req v12 - XL2 .req v13 - XM2 .req v14 - XH2 .req v15 + KS0 .req v12 + KS1 .req v13 + INP0 .req v14 + INP1 .req v15 .macro load_round_keys, rounds, rk cmp \rounds, #12 @@ -310,8 +398,8 @@ ENDPROC(pmull_ghash_update_p8) .endm .macro pmull_gcm_do_crypt, enc - ld1 {HH.2d}, [x4], #16 - ld1 {SHASH.2d}, [x4] + ld1 {SHASH.2d}, [x4], #16 + ld1 {HH.2d}, [x4] ld1 {XL.2d}, [x1] ldr x8, [x5, #8] // load lower counter diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c index 3c2c446dc96c..6e9f33d14930 100644 --- a/arch/arm64/crypto/ghash-ce-glue.c +++ b/arch/arm64/crypto/ghash-ce-glue.c @@ -33,9 +33,12 @@ MODULE_ALIAS_CRYPTO("ghash"); #define GCM_IV_SIZE 12 struct ghash_key { - u64 a; - u64 b; - be128 k; + u64 h[2]; + u64 h2[2]; + u64 h3[2]; + u64 h4[2]; + + be128 k; }; struct ghash_desc_ctx { @@ -46,7 +49,6 @@ struct ghash_desc_ctx { struct gcm_aes_ctx { struct crypto_aes_ctx aes_key; - u64 h2[2]; struct ghash_key ghash_key; }; @@ -63,11 +65,12 @@ static void (*pmull_ghash_update)(int blocks, u64 dg[], const char *src, const char *head); asmlinkage void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], - const u8 src[], u64 const *k, u8 ctr[], - u32 const rk[], int rounds, u8 ks[]); + const u8 src[], struct ghash_key const *k, + u8 ctr[], u32 const rk[], int rounds, + u8 ks[]); asmlinkage void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], - const u8 src[], u64 const *k, + const u8 src[], struct ghash_key const *k, u8 ctr[], u32 const rk[], int rounds); asmlinkage void pmull_gcm_encrypt_block(u8 dst[], u8 const src[], @@ -174,23 +177,36 @@ static int ghash_final(struct shash_desc *desc, u8 *dst) return 0; } +static void ghash_reflect(u64 h[], const be128 *k) +{ + u64 carry = be64_to_cpu(k->a) & BIT(63) ? 
1 : 0; + + h[0] = (be64_to_cpu(k->b) << 1) | carry; + h[1] = (be64_to_cpu(k->a) << 1) | (be64_to_cpu(k->b) >> 63); + + if (carry) + h[1] ^= 0xc200000000000000UL; +} + static int __ghash_setkey(struct ghash_key *key, const u8 *inkey, unsigned int keylen) { - u64 a, b; + be128 h; /* needed for the fallback */ memcpy(&key->k, inkey, GHASH_BLOCK_SIZE); - /* perform multiplication by 'x' in GF(2^128) */ - b = get_unaligned_be64(inkey); - a = get_unaligned_be64(inkey + 8); + ghash_reflect(key->h, &key->k); + + h = key->k; + gf128mul_lle(&h, &key->k); + ghash_reflect(key->h2, &h); - key->a = (a << 1) | (b >> 63); - key->b = (b << 1) | (a >> 63); + gf128mul_lle(&h, &key->k); + ghash_reflect(key->h3, &h); - if (b >> 63) - key->b ^= 0xc200000000000000UL; + gf128mul_lle(&h, &key->k); + ghash_reflect(key->h4, &h); return 0; } @@ -240,8 +256,7 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *inkey, unsigned int keylen) { struct gcm_aes_ctx *ctx = crypto_aead_ctx(tfm); - be128 h1, h2; - u8 *key = (u8 *)&h1; + u8 key[GHASH_BLOCK_SIZE]; int ret; ret = crypto_aes_expand_key(&ctx->aes_key, inkey, keylen); @@ -253,19 +268,7 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *inkey, __aes_arm64_encrypt(ctx->aes_key.key_enc, key, (u8[AES_BLOCK_SIZE]){}, num_rounds(&ctx->aes_key)); - __ghash_setkey(&ctx->ghash_key, key, sizeof(be128)); - - /* calculate H^2 (used for 2-way aggregation) */ - h2 = h1; - gf128mul_lle(&h2, &h1); - - ctx->h2[0] = (be64_to_cpu(h2.b) << 1) | (be64_to_cpu(h2.a) >> 63); - ctx->h2[1] = (be64_to_cpu(h2.a) << 1) | (be64_to_cpu(h2.b) >> 63); - - if (be64_to_cpu(h2.a) >> 63) - ctx->h2[1] ^= 0xc200000000000000UL; - - return 0; + return __ghash_setkey(&ctx->ghash_key, key, sizeof(be128)); } static int gcm_setauthsize(struct crypto_aead *tfm, unsigned int authsize) @@ -401,8 +404,8 @@ static int gcm_encrypt(struct aead_request *req) kernel_neon_begin(); pmull_gcm_encrypt(blocks, dg, walk.dst.virt.addr, - walk.src.virt.addr, ctx->h2, iv, - rk, nrounds, ks); + walk.src.virt.addr, &ctx->ghash_key, + iv, rk, nrounds, ks); kernel_neon_end(); err = skcipher_walk_done(&walk, @@ -512,8 +515,8 @@ static int gcm_decrypt(struct aead_request *req) kernel_neon_begin(); pmull_gcm_decrypt(blocks, dg, walk.dst.virt.addr, - walk.src.virt.addr, ctx->h2, iv, - rk, nrounds); + walk.src.virt.addr, &ctx->ghash_key, + iv, rk, nrounds); /* check if this is the final iteration of the loop */ if (rem < (2 * AES_BLOCK_SIZE)) { -- cgit v1.2.3
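
Editor's note on the 2-way and 4-way aggregation patches above: the underlying identity is ordinary distributivity in GF(2^128). Folding ciphertext blocks serially,

    X_i = (X_{i-1} ^ C_i) * H        for i = 1..4,

gives the same result as combining a whole group of blocks with precomputed powers of H,

    X_4 = (X_0 ^ C_1)*H^4 ^ C_2*H^3 ^ C_3*H^2 ^ C_4*H,

which is what lets the assembler accumulate unreduced partial products for a group and perform the modular reduction only once per group. The stand-alone user-space C sketch below demonstrates the identity. It is purely illustrative and not taken from any of the patches; the names (struct gblock, gf128_mul, gxor) and the deliberately slow bit-serial field multiply are made up for this example and are not kernel APIs.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* one 128-bit GHASH element, stored as two big-endian-ordered halves */
struct gblock { uint64_t hi, lo; };

/*
 * Naive bit-serial multiply in GF(2^128) with x^128 + x^7 + x^2 + x + 1,
 * using the MSB-first bit order defined by the GCM specification.
 * Written for clarity, not speed.
 */
static struct gblock gf128_mul(struct gblock a, struct gblock b)
{
        struct gblock z = { 0, 0 };
        struct gblock v = a;

        for (int i = 0; i < 128; i++) {
                uint64_t bit, carry;

                if (i < 64)
                        bit = (b.hi >> (63 - i)) & 1;
                else
                        bit = (b.lo >> (127 - i)) & 1;

                if (bit) {
                        z.hi ^= v.hi;
                        z.lo ^= v.lo;
                }

                /* multiply v by x: shift right, reduce with R = 0xe1 || 0^120 */
                carry = v.lo & 1;
                v.lo = (v.lo >> 1) | (v.hi << 63);
                v.hi >>= 1;
                if (carry)
                        v.hi ^= 0xe100000000000000ULL;
        }
        return z;
}

static struct gblock gxor(struct gblock a, struct gblock b)
{
        return (struct gblock){ a.hi ^ b.hi, a.lo ^ b.lo };
}

int main(void)
{
        struct gblock h = { 0x0123456789abcdefULL, 0xfedcba9876543210ULL };
        struct gblock c[4] = { { 1, 2 }, { 3, 4 }, { 5, 6 }, { 7, 8 } };
        struct gblock x = { 0, 0 };
        struct gblock serial = x, agg;
        struct gblock h2, h3, h4;

        /* serial folding: one multiply (with reduction) per block */
        for (int i = 0; i < 4; i++)
                serial = gf128_mul(gxor(serial, c[i]), h);

        /* aggregated folding: precompute H^2..H^4, then combine the group */
        h2 = gf128_mul(h, h);
        h3 = gf128_mul(h2, h);
        h4 = gf128_mul(h3, h);

        agg = gf128_mul(gxor(x, c[0]), h4);
        agg = gxor(agg, gf128_mul(c[1], h3));
        agg = gxor(agg, gf128_mul(c[2], h2));
        agg = gxor(agg, gf128_mul(c[3], h));

        printf("serial:     %016llx%016llx\n",
               (unsigned long long)serial.hi, (unsigned long long)serial.lo);
        printf("aggregated: %016llx%016llx\n",
               (unsigned long long)agg.hi, (unsigned long long)agg.lo);

        return memcmp(&serial, &agg, sizeof(serial)) ? 1 : 0;
}

In the kernel code the actual savings come from keeping the per-block partial products (XL/XM/XH) unreduced, XOR-ing them together across the group, and applying the modular reduction only once per two blocks (GCM path) or four blocks (GHASH path); this toy version still reduces inside every multiply, so it only demonstrates that the two orderings produce the same result.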