From 4d80d6ca5d77fde9880da8466e5b64f250e5bf82 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 18 May 2021 11:17:26 +0200 Subject: genirq: Export affinity setter for modules Perf modules abuse irq_set_affinity_hint() to set the affinity of system PMU interrupts just because irq_set_affinity() was not exported. The fact that irq_set_affinity_hint() actually sets the affinity is a non-documented side effect and the name is clearly saying it's a hint. To clean this up, export the real affinity setter. Signed-off-by: Thomas Gleixner Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20210518093117.968251441@linutronix.de --- include/linux/interrupt.h | 35 ++--------------------------------- kernel/irq/manage.c | 33 ++++++++++++++++++++++++++++++++- 2 files changed, 34 insertions(+), 34 deletions(-) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 4777850a6dc7..35a374241515 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -319,39 +319,8 @@ struct irq_affinity_desc { extern cpumask_var_t irq_default_affinity; -/* Internal implementation. Use the helpers below */ -extern int __irq_set_affinity(unsigned int irq, const struct cpumask *cpumask, - bool force); - -/** - * irq_set_affinity - Set the irq affinity of a given irq - * @irq: Interrupt to set affinity - * @cpumask: cpumask - * - * Fails if cpumask does not contain an online CPU - */ -static inline int -irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) -{ - return __irq_set_affinity(irq, cpumask, false); -} - -/** - * irq_force_affinity - Force the irq affinity of a given irq - * @irq: Interrupt to set affinity - * @cpumask: cpumask - * - * Same as irq_set_affinity, but without checking the mask against - * online cpus. - * - * Solely for low level cpu hotplug code, where we need to make per - * cpu interrupts affine before the cpu becomes online. - */ -static inline int -irq_force_affinity(unsigned int irq, const struct cpumask *cpumask) -{ - return __irq_set_affinity(irq, cpumask, true); -} +extern int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask); +extern int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask); extern int irq_can_set_affinity(unsigned int irq); extern int irq_select_affinity(unsigned int irq); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 4c14356543d9..a847dd2044c8 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -441,7 +441,8 @@ out_unlock: return ret; } -int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force) +static int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, + bool force) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; @@ -456,6 +457,36 @@ int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force) return ret; } +/** + * irq_set_affinity - Set the irq affinity of a given irq + * @irq: Interrupt to set affinity + * @cpumask: cpumask + * + * Fails if cpumask does not contain an online CPU + */ +int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) +{ + return __irq_set_affinity(irq, cpumask, false); +} +EXPORT_SYMBOL_GPL(irq_set_affinity); + +/** + * irq_force_affinity - Force the irq affinity of a given irq + * @irq: Interrupt to set affinity + * @cpumask: cpumask + * + * Same as irq_set_affinity, but without checking the mask against + * online cpus. + * + * Solely for low level cpu hotplug code, where we need to make per + * cpu interrupts affine before the cpu becomes online. 
+ */ +int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask) +{ + return __irq_set_affinity(irq, cpumask, true); +} +EXPORT_SYMBOL_GPL(irq_force_affinity); + int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) { unsigned long flags; -- cgit v1.2.3 From 84fca8ba620581067c16f2b578f277b1c72fb74b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 18 May 2021 11:17:27 +0200 Subject: perf/arm-ccn: Use irq_set_affinity() The driver uses irq_set_affinity_hint() to set the affinity for the PMU interrupts, which relies on the undocumented side effect that this function actually sets the affinity under the hood. Setting an hint is clearly not a guarantee and for these PMU interrupts an affinity hint, which is supposed to guide userspace for setting affinity, is beyond pointless, because the affinity of these interrupts cannot be modified from user space. Aside of that the error checks are bogus because the only error which is returned from irq_set_affinity_hint() is when there is no irq descriptor for the interrupt number, but not when the affinity set fails. That's on purpose because the hint can point to an offline CPU. Replace the mindless abuse with irq_set_affinity(). Signed-off-by: Thomas Gleixner Cc: Will Deacon Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20210518093118.128250213@linutronix.de Signed-off-by: Will Deacon --- drivers/perf/arm-ccn.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/perf/arm-ccn.c b/drivers/perf/arm-ccn.c index 96d47cb302dd..a96c31604545 100644 --- a/drivers/perf/arm-ccn.c +++ b/drivers/perf/arm-ccn.c @@ -1211,7 +1211,7 @@ static int arm_ccn_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node) perf_pmu_migrate_context(&dt->pmu, cpu, target); dt->cpu = target; if (ccn->irq) - WARN_ON(irq_set_affinity_hint(ccn->irq, cpumask_of(dt->cpu))); + WARN_ON(irq_set_affinity(ccn->irq, cpumask_of(dt->cpu))); return 0; } @@ -1291,7 +1291,7 @@ static int arm_ccn_pmu_init(struct arm_ccn *ccn) /* Also make sure that the overflow interrupt is handled by this CPU */ if (ccn->irq) { - err = irq_set_affinity_hint(ccn->irq, cpumask_of(ccn->dt.cpu)); + err = irq_set_affinity(ccn->irq, cpumask_of(ccn->dt.cpu)); if (err) { dev_err(ccn->dev, "Failed to set interrupt affinity!\n"); goto error_set_affinity; @@ -1325,8 +1325,6 @@ static void arm_ccn_pmu_cleanup(struct arm_ccn *ccn) cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_CCN_ONLINE, &ccn->dt.node); - if (ccn->irq) - irq_set_affinity_hint(ccn->irq, NULL); for (i = 0; i < ccn->num_xps; i++) writel(0, ccn->xp[i].base + CCN_XP_DT_CONTROL); writel(0, ccn->dt.base + CCN_DT_PMCR); -- cgit v1.2.3 From 8ec25d34012da3bf417a4d16c057a54064626058 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 18 May 2021 11:17:28 +0200 Subject: perf/arm-cmn: Use irq_set_affinity() The driver uses irq_set_affinity_hint() to set the affinity for the PMU interrupts, which relies on the undocumented side effect that this function actually sets the affinity under the hood. Setting an hint is clearly not a guarantee and for these PMU interrupts an affinity hint, which is supposed to guide userspace for setting affinity, is beyond pointless, because the affinity of these interrupts cannot be modified from user space. 
Aside of that the error checks are bogus because the only error which is returned from irq_set_affinity_hint() is when there is no irq descriptor for the interrupt number, but not when the affinity set fails. That's on purpose because the hint can point to an offline CPU. Replace the mindless abuse with irq_set_affinity(). Signed-off-by: Thomas Gleixner Cc: Will Deacon Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20210518093118.277228577@linutronix.de Signed-off-by: Will Deacon --- drivers/perf/arm-cmn.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 56a5c355701d..9417e9c5bcb3 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -1162,7 +1162,7 @@ static int arm_cmn_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node) perf_pmu_migrate_context(&cmn->pmu, cpu, target); for (i = 0; i < cmn->num_dtcs; i++) - irq_set_affinity_hint(cmn->dtc[i].irq, cpumask_of(target)); + irq_set_affinity(cmn->dtc[i].irq, cpumask_of(target)); cmn->cpu = target; return 0; } @@ -1222,7 +1222,7 @@ static int arm_cmn_init_irqs(struct arm_cmn *cmn) if (err) return err; - err = irq_set_affinity_hint(irq, cpumask_of(cmn->cpu)); + err = irq_set_affinity(irq, cpumask_of(cmn->cpu)); if (err) return err; next: @@ -1568,16 +1568,11 @@ static int arm_cmn_probe(struct platform_device *pdev) static int arm_cmn_remove(struct platform_device *pdev) { struct arm_cmn *cmn = platform_get_drvdata(pdev); - int i; writel_relaxed(0, cmn->dtc[0].base + CMN_DT_DTC_CTL); perf_pmu_unregister(&cmn->pmu); cpuhp_state_remove_instance(arm_cmn_hp_state, &cmn->cpuhp_node); - - for (i = 0; i < cmn->num_dtcs; i++) - irq_set_affinity_hint(cmn->dtc[i].irq, NULL); - return 0; } -- cgit v1.2.3 From 1ceeb8d430f5ea780b8f7d02466a7454cc845528 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 18 May 2021 11:17:29 +0200 Subject: perf/arm-dmc620: Use irq_set_affinity() The driver uses irq_set_affinity_hint() to set the affinity for the PMU interrupts, which relies on the undocumented side effect that this function actually sets the affinity under the hood. Setting an hint is clearly not a guarantee and for these PMU interrupts an affinity hint, which is supposed to guide userspace for setting affinity, is beyond pointless, because the affinity of these interrupts cannot be modified from user space. Aside of that the error checks are bogus because the only error which is returned from irq_set_affinity_hint() is when there is no irq descriptor for the interrupt number, but not when the affinity set fails. That's on purpose because the hint can point to an offline CPU. Replace the mindless abuse with irq_set_affinity(). 
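The same pattern repeats in each of the driver conversions that follow. As a rough sketch only (the my_pmu structure and callback below are hypothetical stand-ins, not code from any of these drivers), a CPU hotplug teardown path after the conversion looks like this, with the hint-based call replaced by irq_set_affinity() and the irq_set_affinity_hint(irq, NULL) cleanup on the remove path dropped entirely:

static int my_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
{
	struct my_pmu *pmu = hlist_entry_safe(node, struct my_pmu, node);
	unsigned int target;

	if (cpu != pmu->cpu)
		return 0;

	target = cpumask_any_but(cpu_online_mask, cpu);
	if (target >= nr_cpu_ids)
		return 0;

	perf_pmu_migrate_context(&pmu->pmu, cpu, target);
	pmu->cpu = target;

	/* Move the interrupt with the events; a failure here is a real error. */
	WARN_ON(irq_set_affinity(pmu->irq, cpumask_of(target)));
	return 0;
}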
Signed-off-by: Thomas Gleixner Cc: Will Deacon Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20210518093118.395086573@linutronix.de Signed-off-by: Will Deacon --- drivers/perf/arm_dmc620_pmu.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/perf/arm_dmc620_pmu.c b/drivers/perf/arm_dmc620_pmu.c index b6c2511d59af..280a6ae3e27c 100644 --- a/drivers/perf/arm_dmc620_pmu.c +++ b/drivers/perf/arm_dmc620_pmu.c @@ -421,7 +421,7 @@ static struct dmc620_pmu_irq *__dmc620_pmu_get_irq(int irq_num) if (ret) goto out_free_aff; - ret = irq_set_affinity_hint(irq_num, cpumask_of(irq->cpu)); + ret = irq_set_affinity(irq_num, cpumask_of(irq->cpu)); if (ret) goto out_free_irq; @@ -475,7 +475,6 @@ static void dmc620_pmu_put_irq(struct dmc620_pmu *dmc620_pmu) list_del(&irq->irqs_node); mutex_unlock(&dmc620_pmu_irqs_lock); - WARN_ON(irq_set_affinity_hint(irq->irq_num, NULL)); free_irq(irq->irq_num, irq); cpuhp_state_remove_instance_nocalls(cpuhp_state_num, &irq->node); kfree(irq); @@ -622,7 +621,7 @@ static int dmc620_pmu_cpu_teardown(unsigned int cpu, perf_pmu_migrate_context(&dmc620_pmu->pmu, irq->cpu, target); mutex_unlock(&dmc620_pmu_irqs_lock); - WARN_ON(irq_set_affinity_hint(irq->irq_num, cpumask_of(target))); + WARN_ON(irq_set_affinity(irq->irq_num, cpumask_of(target))); irq->cpu = target; return 0; -- cgit v1.2.3 From 41ea281724c097e15aca1a8522abbfa54a60acde Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 18 May 2021 11:17:30 +0200 Subject: perf/arm-dsu: Use irq_set_affinity() The driver uses irq_set_affinity_hint() to set the affinity for the PMU interrupts, which relies on the undocumented side effect that this function actually sets the affinity under the hood. Setting an hint is clearly not a guarantee and for these PMU interrupts an affinity hint, which is supposed to guide userspace for setting affinity, is beyond pointless, because the affinity of these interrupts cannot be modified from user space. Aside of that the error checks are bogus because the only error which is returned from irq_set_affinity_hint() is when there is no irq descriptor for the interrupt number, but not when the affinity set fails. That's on purpose because the hint can point to an offline CPU. Replace the mindless abuse with irq_set_affinity(). 
Signed-off-by: Thomas Gleixner Cc: Will Deacon Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20210518093118.505110632@linutronix.de Signed-off-by: Will Deacon --- drivers/perf/arm_dsu_pmu.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/perf/arm_dsu_pmu.c b/drivers/perf/arm_dsu_pmu.c index 196faea074d0..a36698a90d2f 100644 --- a/drivers/perf/arm_dsu_pmu.c +++ b/drivers/perf/arm_dsu_pmu.c @@ -687,7 +687,7 @@ static void dsu_pmu_probe_pmu(struct dsu_pmu *dsu_pmu) static void dsu_pmu_set_active_cpu(int cpu, struct dsu_pmu *dsu_pmu) { cpumask_set_cpu(cpu, &dsu_pmu->active_cpu); - if (irq_set_affinity_hint(dsu_pmu->irq, &dsu_pmu->active_cpu)) + if (irq_set_affinity(dsu_pmu->irq, &dsu_pmu->active_cpu)) pr_warn("Failed to set irq affinity to %d\n", cpu); } @@ -769,7 +769,6 @@ static int dsu_pmu_device_probe(struct platform_device *pdev) if (rc) { cpuhp_state_remove_instance(dsu_pmu_cpuhp_state, &dsu_pmu->cpuhp_node); - irq_set_affinity_hint(dsu_pmu->irq, NULL); } return rc; @@ -781,7 +780,6 @@ static int dsu_pmu_device_remove(struct platform_device *pdev) perf_pmu_unregister(&dsu_pmu->pmu); cpuhp_state_remove_instance(dsu_pmu_cpuhp_state, &dsu_pmu->cpuhp_node); - irq_set_affinity_hint(dsu_pmu->irq, NULL); return 0; } @@ -840,10 +838,8 @@ static int dsu_pmu_cpu_teardown(unsigned int cpu, struct hlist_node *node) dst = dsu_pmu_get_online_cpu_any_but(dsu_pmu, cpu); /* If there are no active CPUs in the DSU, leave IRQ disabled */ - if (dst >= nr_cpu_ids) { - irq_set_affinity_hint(dsu_pmu->irq, NULL); + if (dst >= nr_cpu_ids) return 0; - } perf_pmu_migrate_context(&dsu_pmu->pmu, cpu, dst); dsu_pmu_set_active_cpu(dst, dsu_pmu); -- cgit v1.2.3 From 2621054535955fb78ea96b76b279eb481f40fcef Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 18 May 2021 11:17:31 +0200 Subject: perf/arm-smmuv3: Use irq_set_affinity() The driver uses irq_set_affinity_hint() to set the affinity for the PMU interrupts, which relies on the undocumented side effect that this function actually sets the affinity under the hood. Setting an hint is clearly not a guarantee and for these PMU interrupts an affinity hint, which is supposed to guide userspace for setting affinity, is beyond pointless, because the affinity of these interrupts cannot be modified from user space. Aside of that the error checks are bogus because the only error which is returned from irq_set_affinity_hint() is when there is no irq descriptor for the interrupt number, but not when the affinity set fails. That's on purpose because the hint can point to an offline CPU. Replace the mindless abuse with irq_set_affinity(). 
Signed-off-by: Thomas Gleixner Cc: Will Deacon Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20210518093118.603636289@linutronix.de Signed-off-by: Will Deacon --- drivers/perf/arm_smmuv3_pmu.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c index ff6fab4bae30..7786ccc6d12f 100644 --- a/drivers/perf/arm_smmuv3_pmu.c +++ b/drivers/perf/arm_smmuv3_pmu.c @@ -628,7 +628,7 @@ static int smmu_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node) perf_pmu_migrate_context(&smmu_pmu->pmu, cpu, target); smmu_pmu->on_cpu = target; - WARN_ON(irq_set_affinity_hint(smmu_pmu->irq, cpumask_of(target))); + WARN_ON(irq_set_affinity(smmu_pmu->irq, cpumask_of(target))); return 0; } @@ -839,15 +839,14 @@ static int smmu_pmu_probe(struct platform_device *pdev) /* Pick one CPU to be the preferred one to use */ smmu_pmu->on_cpu = raw_smp_processor_id(); - WARN_ON(irq_set_affinity_hint(smmu_pmu->irq, - cpumask_of(smmu_pmu->on_cpu))); + WARN_ON(irq_set_affinity(smmu_pmu->irq, cpumask_of(smmu_pmu->on_cpu))); err = cpuhp_state_add_instance_nocalls(cpuhp_state_num, &smmu_pmu->node); if (err) { dev_err(dev, "Error %d registering hotplug, PMU @%pa\n", err, &res_0->start); - goto out_clear_affinity; + return err; } err = perf_pmu_register(&smmu_pmu->pmu, name, -1); @@ -866,8 +865,6 @@ static int smmu_pmu_probe(struct platform_device *pdev) out_unregister: cpuhp_state_remove_instance_nocalls(cpuhp_state_num, &smmu_pmu->node); -out_clear_affinity: - irq_set_affinity_hint(smmu_pmu->irq, NULL); return err; } @@ -877,7 +874,6 @@ static int smmu_pmu_remove(struct platform_device *pdev) perf_pmu_unregister(&smmu_pmu->pmu); cpuhp_state_remove_instance_nocalls(cpuhp_state_num, &smmu_pmu->node); - irq_set_affinity_hint(smmu_pmu->irq, NULL); return 0; } -- cgit v1.2.3 From ba4489fb949cbd9c9b877dceae361129ed6280f1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 18 May 2021 11:17:32 +0200 Subject: perf/imx_ddr: Use irq_set_affinity() The driver uses irq_set_affinity_hint() to set the affinity for the PMU interrupts, which relies on the undocumented side effect that this function actually sets the affinity under the hood. Setting an hint is clearly not a guarantee and for these PMU interrupts an affinity hint, which is supposed to guide userspace for setting affinity, is beyond pointless, because the affinity of these interrupts cannot be modified from user space. Aside of that the error checks are bogus because the only error which is returned from irq_set_affinity_hint() is when there is no irq descriptor for the interrupt number, but not when the affinity set fails. That's on purpose because the hint can point to an offline CPU. Replace the mindless abuse with irq_set_affinity(). 
Signed-off-by: Thomas Gleixner Cc: Frank Li Cc: Will Deacon Cc: Mark Rutland Cc: Shawn Guo Cc: Sascha Hauer Cc: Pengutronix Kernel Team Cc: Fabio Estevam Cc: NXP Linux Team Cc: linux-arm-kernel@lists.infradead.org Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20210518093118.699566062@linutronix.de Signed-off-by: Will Deacon --- drivers/perf/fsl_imx8_ddr_perf.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/perf/fsl_imx8_ddr_perf.c b/drivers/perf/fsl_imx8_ddr_perf.c index 2bbb93188064..df048fe42fc2 100644 --- a/drivers/perf/fsl_imx8_ddr_perf.c +++ b/drivers/perf/fsl_imx8_ddr_perf.c @@ -674,7 +674,7 @@ static int ddr_perf_offline_cpu(unsigned int cpu, struct hlist_node *node) perf_pmu_migrate_context(&pmu->pmu, cpu, target); pmu->cpu = target; - WARN_ON(irq_set_affinity_hint(pmu->irq, cpumask_of(pmu->cpu))); + WARN_ON(irq_set_affinity(pmu->irq, cpumask_of(pmu->cpu))); return 0; } @@ -749,7 +749,7 @@ static int ddr_perf_probe(struct platform_device *pdev) } pmu->irq = irq; - ret = irq_set_affinity_hint(pmu->irq, cpumask_of(pmu->cpu)); + ret = irq_set_affinity(pmu->irq, cpumask_of(pmu->cpu)); if (ret) { dev_err(pmu->dev, "Failed to set interrupt affinity!\n"); goto ddr_perf_err; @@ -777,7 +777,6 @@ static int ddr_perf_remove(struct platform_device *pdev) cpuhp_state_remove_instance_nocalls(pmu->cpuhp_state, &pmu->node); cpuhp_remove_multi_state(pmu->cpuhp_state); - irq_set_affinity_hint(pmu->irq, NULL); perf_pmu_unregister(&pmu->pmu); -- cgit v1.2.3 From 77b06ddc04354293f746d0434f00700110d3392d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 18 May 2021 11:17:33 +0200 Subject: perf/hisi: Use irq_set_affinity() These drivers use irq_set_affinity_hint() to set the affinity for the PMU interrupts, which relies on the undocumented side effect that this function actually sets the affinity under the hood. Setting an hint is clearly not a guarantee and for these PMU interrupts an affinity hint, which is supposed to guide userspace for setting affinity, is beyond pointless, because the affinity of these interrupts cannot be modified from user space. Aside of that the error checks are bogus because the only error which is returned from irq_set_affinity_hint() is when there is no irq descriptor for the interrupt number, but not when the affinity set fails. That's on purpose because the hint can point to an offline CPU. Replace the mindless abuse with irq_set_affinity(). 
Signed-off-by: Thomas Gleixner Cc: Shaokun Zhang Cc: Will Deacon Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20210518093118.813375875@linutronix.de Signed-off-by: Will Deacon --- drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c | 3 --- drivers/perf/hisilicon/hisi_uncore_hha_pmu.c | 3 --- drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c | 3 --- drivers/perf/hisilicon/hisi_uncore_pa_pmu.c | 3 --- drivers/perf/hisilicon/hisi_uncore_pmu.c | 4 ++-- drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c | 3 --- 6 files changed, 2 insertions(+), 17 deletions(-) diff --git a/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c b/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c index 7c8a4bc21db4..0c7777bf1542 100644 --- a/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c @@ -537,7 +537,6 @@ static int hisi_ddrc_pmu_probe(struct platform_device *pdev) dev_err(ddrc_pmu->dev, "DDRC PMU register failed!\n"); cpuhp_state_remove_instance_nocalls( CPUHP_AP_PERF_ARM_HISI_DDRC_ONLINE, &ddrc_pmu->node); - irq_set_affinity_hint(ddrc_pmu->irq, NULL); } return ret; @@ -550,8 +549,6 @@ static int hisi_ddrc_pmu_remove(struct platform_device *pdev) perf_pmu_unregister(&ddrc_pmu->pmu); cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_HISI_DDRC_ONLINE, &ddrc_pmu->node); - irq_set_affinity_hint(ddrc_pmu->irq, NULL); - return 0; } diff --git a/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c b/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c index 0316fabe32f1..12eb41ab1b8a 100644 --- a/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c @@ -540,7 +540,6 @@ static int hisi_hha_pmu_probe(struct platform_device *pdev) dev_err(hha_pmu->dev, "HHA PMU register failed!\n"); cpuhp_state_remove_instance_nocalls( CPUHP_AP_PERF_ARM_HISI_HHA_ONLINE, &hha_pmu->node); - irq_set_affinity_hint(hha_pmu->irq, NULL); } return ret; @@ -553,8 +552,6 @@ static int hisi_hha_pmu_remove(struct platform_device *pdev) perf_pmu_unregister(&hha_pmu->pmu); cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_HISI_HHA_ONLINE, &hha_pmu->node); - irq_set_affinity_hint(hha_pmu->irq, NULL); - return 0; } diff --git a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c index bf9f7772cac9..773f69538090 100644 --- a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c @@ -578,7 +578,6 @@ static int hisi_l3c_pmu_probe(struct platform_device *pdev) dev_err(l3c_pmu->dev, "L3C PMU register failed!\n"); cpuhp_state_remove_instance_nocalls( CPUHP_AP_PERF_ARM_HISI_L3_ONLINE, &l3c_pmu->node); - irq_set_affinity_hint(l3c_pmu->irq, NULL); } return ret; @@ -591,8 +590,6 @@ static int hisi_l3c_pmu_remove(struct platform_device *pdev) perf_pmu_unregister(&l3c_pmu->pmu); cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_HISI_L3_ONLINE, &l3c_pmu->node); - irq_set_affinity_hint(l3c_pmu->irq, NULL); - return 0; } diff --git a/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c b/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c index 14f23eb31248..e1f71eab5640 100644 --- a/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c @@ -436,7 +436,6 @@ static int hisi_pa_pmu_probe(struct platform_device *pdev) dev_err(pa_pmu->dev, "PMU register failed, ret = %d\n", ret); cpuhp_state_remove_instance(CPUHP_AP_PERF_ARM_HISI_PA_ONLINE, &pa_pmu->node); - irq_set_affinity_hint(pa_pmu->irq, NULL); return ret; } @@ -451,8 +450,6 @@ static int 
hisi_pa_pmu_remove(struct platform_device *pdev) perf_pmu_unregister(&pa_pmu->pmu); cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_HISI_PA_ONLINE, &pa_pmu->node); - irq_set_affinity_hint(pa_pmu->irq, NULL); - return 0; } diff --git a/drivers/perf/hisilicon/hisi_uncore_pmu.c b/drivers/perf/hisilicon/hisi_uncore_pmu.c index 13c68b5e39c4..5842593632e4 100644 --- a/drivers/perf/hisilicon/hisi_uncore_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_pmu.c @@ -488,7 +488,7 @@ int hisi_uncore_pmu_online_cpu(unsigned int cpu, struct hlist_node *node) hisi_pmu->on_cpu = cpu; /* Overflow interrupt also should use the same CPU */ - WARN_ON(irq_set_affinity_hint(hisi_pmu->irq, cpumask_of(cpu))); + WARN_ON(irq_set_affinity(hisi_pmu->irq, cpumask_of(cpu))); return 0; } @@ -521,7 +521,7 @@ int hisi_uncore_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node) perf_pmu_migrate_context(&hisi_pmu->pmu, cpu, target); /* Use this CPU for event counting */ hisi_pmu->on_cpu = target; - WARN_ON(irq_set_affinity_hint(hisi_pmu->irq, cpumask_of(target))); + WARN_ON(irq_set_affinity(hisi_pmu->irq, cpumask_of(target))); return 0; } diff --git a/drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c b/drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c index 46be312fa126..835ec3e2178f 100644 --- a/drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c @@ -465,7 +465,6 @@ static int hisi_sllc_pmu_probe(struct platform_device *pdev) dev_err(sllc_pmu->dev, "PMU register failed, ret = %d\n", ret); cpuhp_state_remove_instance(CPUHP_AP_PERF_ARM_HISI_SLLC_ONLINE, &sllc_pmu->node); - irq_set_affinity_hint(sllc_pmu->irq, NULL); return ret; } @@ -481,8 +480,6 @@ static int hisi_sllc_pmu_remove(struct platform_device *pdev) perf_pmu_unregister(&sllc_pmu->pmu); cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_HISI_SLLC_ONLINE, &sllc_pmu->node); - irq_set_affinity_hint(sllc_pmu->irq, NULL); - return 0; } -- cgit v1.2.3 From 7d7b720a4b8049446cffce870b1dd3ffa89d4b40 Mon Sep 17 00:00:00 2001 From: "Madhavan T. Venkataraman" Date: Mon, 10 May 2021 12:00:26 +0100 Subject: arm64: Implement stack trace termination record Reliable stacktracing requires that we identify when a stacktrace is terminated early. We can do this by ensuring all tasks have a final frame record at a known location on their task stack, and checking that this is the final frame record in the chain. We'd like to use task_pt_regs(task)->stackframe as the final frame record, as this is already setup upon exception entry from EL0. For kernel tasks we need to consistently reserve the pt_regs and point x29 at this, which we can do with small changes to __primary_switched, __secondary_switched, and copy_process(). Since the final frame record must be at a specific location, we must create the final frame record in __primary_switched and __secondary_switched rather than leaving this to start_kernel and secondary_start_kernel. Thus, __primary_switched and __secondary_switched will now show up in stacktraces for the idle tasks. Since the final frame record is now identified by its location rather than by its contents, we identify it at the start of unwind_frame(), before we read any values from it. External debuggers may terminate the stack trace when FP == 0. In the pt_regs->stackframe, the PC is 0 as well. So, stack traces taken in the debugger may print an extra record 0x0 at the end. While this is not pretty, this does not do any harm. This is a small price to pay for having reliable stack trace termination in the kernel. 
That said, gdb does not show the extra record probably because it uses DWARF and not frame pointers for stack traces. Signed-off-by: Madhavan T. Venkataraman Reviewed-by: Mark Brown [Mark: rebase, use ASM_BUG(), update comments, update commit message] Signed-off-by: Mark Rutland Link: https://lore.kernel.org/r/20210510110026.18061-1-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/entry.S | 2 +- arch/arm64/kernel/head.S | 25 +++++++++++++++++++------ arch/arm64/kernel/process.c | 5 +++++ arch/arm64/kernel/stacktrace.c | 16 +++++++--------- 4 files changed, 32 insertions(+), 16 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 3513984a88bd..294f24e16fee 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -285,7 +285,7 @@ alternative_else_nop_endif stp lr, x21, [sp, #S_LR] /* - * For exceptions from EL0, create a terminal frame record. + * For exceptions from EL0, create a final frame record. * For exceptions from EL1, create a synthetic frame record so the * interrupted code shows up in the backtrace. */ diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 96873dfa67fd..cc2d45d54838 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -393,6 +394,18 @@ SYM_FUNC_START_LOCAL(__create_page_tables) ret x28 SYM_FUNC_END(__create_page_tables) + /* + * Create a final frame record at task_pt_regs(current)->stackframe, so + * that the unwinder can identify the final frame record of any task by + * its location in the task stack. We reserve the entire pt_regs space + * for consistency with user tasks and kthreads. + */ + .macro setup_final_frame + sub sp, sp, #PT_REGS_SIZE + stp xzr, xzr, [sp, #S_STACKFRAME] + add x29, sp, #S_STACKFRAME + .endm + /* * The following fragment of code is executed with the MMU enabled. * @@ -447,9 +460,9 @@ SYM_FUNC_START_LOCAL(__primary_switched) #endif bl switch_to_vhe // Prefer VHE if possible add sp, sp, #16 - mov x29, #0 - mov x30, #0 - b start_kernel + setup_final_frame + bl start_kernel + ASM_BUG() SYM_FUNC_END(__primary_switched) .pushsection ".rodata", "a" @@ -639,14 +652,14 @@ SYM_FUNC_START_LOCAL(__secondary_switched) cbz x2, __secondary_too_slow msr sp_el0, x2 scs_load x2, x3 - mov x29, #0 - mov x30, #0 + setup_final_frame #ifdef CONFIG_ARM64_PTR_AUTH ptrauth_keys_init_cpu x2, x3, x4, x5 #endif - b secondary_start_kernel + bl secondary_start_kernel + ASM_BUG() SYM_FUNC_END(__secondary_switched) SYM_FUNC_START_LOCAL(__secondary_too_slow) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index b4bb67f17a2c..8928fba54e4b 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -435,6 +435,11 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, } p->thread.cpu_context.pc = (unsigned long)ret_from_fork; p->thread.cpu_context.sp = (unsigned long)childregs; + /* + * For the benefit of the unwinder, set up childregs->stackframe + * as the final frame for the new task. 
+ */ + p->thread.cpu_context.fp = (unsigned long)childregs->stackframe; ptrace_hw_copy_thread(p); diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index de07147a7926..36cf05d5eb9e 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -68,12 +68,16 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) unsigned long fp = frame->fp; struct stack_info info; - if (fp & 0xf) - return -EINVAL; - if (!tsk) tsk = current; + /* Final frame; nothing to unwind */ + if (fp == (unsigned long)task_pt_regs(tsk)->stackframe) + return -ENOENT; + + if (fp & 0xf) + return -EINVAL; + if (!on_accessible_stack(tsk, fp, &info)) return -EINVAL; @@ -128,12 +132,6 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) frame->pc = ptrauth_strip_insn_pac(frame->pc); - /* - * This is a terminal record, so we have finished unwinding. - */ - if (!frame->fp && !frame->pc) - return -ENOENT; - return 0; } NOKPROBE_SYMBOL(unwind_frame); -- cgit v1.2.3 From 7e04cc918954f9090952e8d17cb2c3c4a5ad055e Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 10 May 2021 17:52:06 +0530 Subject: arm64/mm: Validate CONFIG_PGTABLE_LEVELS CONFIG_PGTABLE_LEVELS has been statically defined in (arch/arm64/Kconfig) depending on the page size and requested virtual address range. In order to validate this page table levels selection this adds a BUILD_BUG_ON() as per the existing formula ARM64_HW_PGTABLE_LEVELS(). This would help protect any inadvertent changes to CONFIG_PGTABLE_LEVELS selection. Cc: Catalin Marinas Cc: Will Deacon Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Link: https://lore.kernel.org/r/1620649326-24115-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Will Deacon --- arch/arm64/mm/init.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index e55409caaee3..6e1ca044ca90 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -499,6 +499,13 @@ void __init mem_init(void) BUILD_BUG_ON(TASK_SIZE_32 > DEFAULT_MAP_WINDOW_64); #endif + /* + * Selected page table levels should match when derived from + * scratch using the virtual address range and page size. + */ + BUILD_BUG_ON(ARM64_HW_PGTABLE_LEVELS(CONFIG_ARM64_VA_BITS) != + CONFIG_PGTABLE_LEVELS); + if (PAGE_SIZE >= 16384 && get_num_physpages() <= 128) { extern int sysctl_overcommit_memory; /* -- cgit v1.2.3 From bf2367aaed73f06a43c0be3c61dafdc59f986161 Mon Sep 17 00:00:00 2001 From: Zou Wei Date: Tue, 11 May 2021 14:42:44 +0800 Subject: drivers/perf: Remove redundant dev_err call in tx2_uncore_pmu_init_dev() There is a error message within devm_ioremap_resource already, so remove the dev_err call to avoid redundant error message. 
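As a minimal sketch of the resulting idiom (the helper below is hypothetical, not the thunderx2 code), the caller simply propagates the failure and relies on the core having logged it:

static void __iomem *example_map_regs(struct device *dev, struct resource *res)
{
	void __iomem *base = devm_ioremap_resource(dev, res);

	/* devm_ioremap_resource() has already printed a dev_err() on failure */
	if (IS_ERR(base))
		return NULL;

	return base;
}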
Reported-by: Hulk Robot Signed-off-by: Zou Wei Link: https://lore.kernel.org/r/1620715364-107460-1-git-send-email-zou_wei@huawei.com Signed-off-by: Will Deacon --- drivers/perf/thunderx2_pmu.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/perf/thunderx2_pmu.c b/drivers/perf/thunderx2_pmu.c index 06a6d569b0b5..fc1a376ee906 100644 --- a/drivers/perf/thunderx2_pmu.c +++ b/drivers/perf/thunderx2_pmu.c @@ -817,10 +817,8 @@ static struct tx2_uncore_pmu *tx2_uncore_pmu_init_dev(struct device *dev, } base = devm_ioremap_resource(dev, &res); - if (IS_ERR(base)) { - dev_err(dev, "PMU type %d: Fail to map resource\n", type); + if (IS_ERR(base)) return NULL; - } tx2_pmu = devm_kzalloc(dev, sizeof(*tx2_pmu), GFP_KERNEL); if (!tx2_pmu) -- cgit v1.2.3 From 27e4482075718997b366e19eaa81aeb7e42e1df3 Mon Sep 17 00:00:00 2001 From: Junhao He Date: Tue, 11 May 2021 20:27:31 +0800 Subject: drivers/perf: arm_spe_pmu: Fix some coding style issues Fix some coding style issues reported by checkpatch.pl, including following types: WARNING: void function return statements are not generally useful WARNING: Possible unnecessary 'out of memory' message Signed-off-by: Junhao He Signed-off-by: Jay Fang Link: https://lore.kernel.org/r/1620736054-58412-2-git-send-email-f.fangjian@huawei.com Signed-off-by: Will Deacon --- drivers/perf/arm_spe_pmu.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c index 8a1e86ab2d8e..e3711cb4c1b5 100644 --- a/drivers/perf/arm_spe_pmu.c +++ b/drivers/perf/arm_spe_pmu.c @@ -1044,7 +1044,6 @@ static void __arm_spe_pmu_dev_probe(void *info) spe_pmu->max_record_sz, spe_pmu->align, spe_pmu->features); spe_pmu->features |= SPE_PMU_FEAT_DEV_PROBED; - return; } static void __arm_spe_pmu_reset_local(void) @@ -1190,10 +1189,8 @@ static int arm_spe_pmu_device_probe(struct platform_device *pdev) } spe_pmu = devm_kzalloc(dev, sizeof(*spe_pmu), GFP_KERNEL); - if (!spe_pmu) { - dev_err(dev, "failed to allocate spe_pmu\n"); + if (!spe_pmu) return -ENOMEM; - } spe_pmu->handle = alloc_percpu(typeof(*spe_pmu->handle)); if (!spe_pmu->handle) -- cgit v1.2.3 From f265fd166bce9837ce1ae6c2a4b56f8bd18d1fe4 Mon Sep 17 00:00:00 2001 From: Junhao He Date: Tue, 11 May 2021 20:27:32 +0800 Subject: drivers/perf: arm_pmu: Fix some coding style issues Fix some coding style issues reported by checkpatch.pl, including following types: ERROR: spaces required around that '=' (ctx:VxW) WARNING: Possible unnecessary 'out of memory' message Signed-off-by: Junhao He Signed-off-by: Jay Fang Link: https://lore.kernel.org/r/1620736054-58412-3-git-send-email-f.fangjian@huawei.com Signed-off-by: Will Deacon --- drivers/perf/arm_pmu.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index d4f7f1f9cc77..e57b348c1628 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -670,7 +670,7 @@ int armpmu_request_irq(int irq, int cpu) &cpu_armpmu); irq_ops = &percpu_pmuirq_ops; } else { - has_nmi= true; + has_nmi = true; irq_ops = &percpu_pmunmi_ops; } } else { @@ -869,10 +869,8 @@ static struct arm_pmu *__armpmu_alloc(gfp_t flags) int cpu; pmu = kzalloc(sizeof(*pmu), flags); - if (!pmu) { - pr_info("failed to allocate PMU device!\n"); + if (!pmu) goto out; - } pmu->hw_events = alloc_percpu_gfp(struct pmu_hw_events, flags); if (!pmu->hw_events) { -- cgit v1.2.3 From a9f00c9760febb84215bcb489855b5b23e3ab4dc Mon Sep 17 00:00:00 2001 From: Junhao He Date: Tue, 11 
May 2021 20:27:33 +0800 Subject: drivers/perf: arm-cmn: Add space after ',' Fix a warning from checkpatch.pl. ERROR: space required after that ',' (ctx:VxV) Signed-off-by: Junhao He Signed-off-by: Jay Fang Link: https://lore.kernel.org/r/1620736054-58412-4-git-send-email-f.fangjian@huawei.com Signed-off-by: Will Deacon --- drivers/perf/arm-cmn.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 9417e9c5bcb3..4f46f654279d 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -31,7 +31,7 @@ #define CMN_CI_CHILD_COUNT GENMASK_ULL(15, 0) #define CMN_CI_CHILD_PTR_OFFSET GENMASK_ULL(31, 16) -#define CMN_CHILD_NODE_ADDR GENMASK(27,0) +#define CMN_CHILD_NODE_ADDR GENMASK(27, 0) #define CMN_CHILD_NODE_EXTERNAL BIT(31) #define CMN_ADDR_NODE_PTR GENMASK(27, 14) -- cgit v1.2.3 From eb2b22f024c3615d576cead56f2a7d2c90355716 Mon Sep 17 00:00:00 2001 From: Junhao He Date: Tue, 11 May 2021 20:27:34 +0800 Subject: drivers/perf: arm-cci: Fix checkpatch spacing error Fix some coding style issues reported by checkpatch.pl, including following types: ERROR: need consistent spacing around '-' (ctx:WxV) ERROR: space required before the open parenthesis '(' Signed-off-by: Junhao He Signed-off-by: Jay Fang Link: https://lore.kernel.org/r/1620736054-58412-5-git-send-email-f.fangjian@huawei.com Signed-off-by: Will Deacon --- drivers/perf/arm-cci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/perf/arm-cci.c b/drivers/perf/arm-cci.c index 666d8a9b557f..54aca3a62814 100644 --- a/drivers/perf/arm-cci.c +++ b/drivers/perf/arm-cci.c @@ -37,7 +37,7 @@ #define CCI_PMU_CNTR_SIZE(model) ((model)->cntr_size) #define CCI_PMU_CNTR_BASE(model, idx) ((idx) * CCI_PMU_CNTR_SIZE(model)) -#define CCI_PMU_CNTR_MASK ((1ULL << 32) -1) +#define CCI_PMU_CNTR_MASK ((1ULL << 32) - 1) #define CCI_PMU_CNTR_LAST(cci_pmu) (cci_pmu->num_cntrs - 1) #define CCI_PMU_MAX_HW_CNTRS(model) \ @@ -806,7 +806,7 @@ static int pmu_get_event_idx(struct cci_pmu_hw_events *hw, struct perf_event *ev return cci_pmu->model->get_event_idx(cci_pmu, hw, cci_event); /* Generic code to find an unused idx from the mask */ - for(idx = 0; idx <= CCI_PMU_CNTR_LAST(cci_pmu); idx++) + for (idx = 0; idx <= CCI_PMU_CNTR_LAST(cci_pmu); idx++) if (!test_and_set_bit(idx, hw->used_mask)) return idx; -- cgit v1.2.3 From e377ab82311af95c99648c6424a6b888a0ccb102 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 10 May 2021 16:37:51 +0530 Subject: arm64/mm: Remove [PUD|PMD]_TABLE_BIT from [pud|pmd]_bad() Semantics wise, [pud|pmd]_bad() have always implied that a given [PUD|PMD] entry does not have a pointer to the next level page table. This had been made clear in the commit a1c76574f345 ("arm64: mm: use *_sect to check for section maps"). Hence explicitly check for a table entry rather than just testing a single bit. This basically redefines [pud|pmd]_bad() in terms of [pud|pmd]_table() making the semantics clear. 
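To make the semantic change concrete, a small standalone illustration (the constants follow the arm64 level-1/2 descriptor encoding, bits[1:0] = 0b11 for a table pointer and 0b01 for a block/section; the helpers are simplified stand-ins for the kernel macros, not the kernel code itself):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TYPE_MASK	3ull	/* bits[1:0] of a PMD/PUD descriptor */
#define TYPE_TABLE	3ull	/* valid entry pointing to a next-level table */
#define TABLE_BIT	2ull	/* bit 1 alone, as tested by the old pmd_bad() */

static bool bad_old(uint64_t e) { return !(e & TABLE_BIT); }
static bool bad_new(uint64_t e) { return (e & TYPE_MASK) != TYPE_TABLE; }

int main(void)
{
	uint64_t table = 0xf803;	/* bits[1:0] = 0b11: next-level table */
	uint64_t block = 0xf401;	/* bits[1:0] = 0b01: section mapping  */
	uint64_t stale = 0x0002;	/* table bit set but valid bit clear  */

	printf("block: old=%d new=%d\n", bad_old(block), bad_new(block));	/* 1 1 */
	printf("stale: old=%d new=%d\n", bad_old(stale), bad_new(stale));	/* 0 1 */
	printf("table: old=%d new=%d\n", bad_old(table), bad_new(table));	/* 0 0 */
	return 0;
}

The only behavioural difference is for entries that have the table bit set but are not valid table descriptors; with the redefinition those are reported as bad as well, which matches the stated semantics.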
Cc: Catalin Marinas Cc: Will Deacon Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Acked-by: Mark Rutland Acked-by: Catalin Marinas Acked-by: Mark Rutland Link: https://lore.kernel.org/r/1620644871-26280-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/pgtable.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 0b10204e72fc..11e60d0cd9b6 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -511,13 +511,12 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, #define pmd_none(pmd) (!pmd_val(pmd)) -#define pmd_bad(pmd) (!(pmd_val(pmd) & PMD_TABLE_BIT)) - #define pmd_table(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \ PMD_TYPE_TABLE) #define pmd_sect(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \ PMD_TYPE_SECT) #define pmd_leaf(pmd) pmd_sect(pmd) +#define pmd_bad(pmd) (!pmd_table(pmd)) #define pmd_leaf_size(pmd) (pmd_cont(pmd) ? CONT_PMD_SIZE : PMD_SIZE) #define pte_leaf_size(pte) (pte_cont(pte) ? CONT_PTE_SIZE : PAGE_SIZE) @@ -604,7 +603,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) pr_err("%s:%d: bad pmd %016llx.\n", __FILE__, __LINE__, pmd_val(e)) #define pud_none(pud) (!pud_val(pud)) -#define pud_bad(pud) (!(pud_val(pud) & PUD_TABLE_BIT)) +#define pud_bad(pud) (!pud_table(pud)) #define pud_present(pud) pte_present(pud_pte(pud)) #define pud_leaf(pud) pud_sect(pud) #define pud_valid(pud) pte_valid(pud_pte(pud)) -- cgit v1.2.3 From ca940790d2ddc91e976f1e9e685052a54a1c50cf Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 12 May 2021 17:23:50 +0100 Subject: arm64: Document requirement for access to FEAT_HCX v8.7 of the architecture introduced FEAT_HCX which adds an additional hypervisor configuration register HCRX_EL2. Even though Linux does not currently make use of this feature let's document that the EL3 trap for access to the register should be disabled so that we are able to make use of it in future. Signed-off-by: Mark Brown Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20210512162350.20349-1-broonie@kernel.org Signed-off-by: Will Deacon --- Documentation/arm64/booting.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/arm64/booting.rst b/Documentation/arm64/booting.rst index 18b8cc1bf32c..a9192e7a231b 100644 --- a/Documentation/arm64/booting.rst +++ b/Documentation/arm64/booting.rst @@ -277,6 +277,12 @@ Before jumping into the kernel, the following conditions must be met: - SCR_EL3.FGTEn (bit 27) must be initialised to 0b1. + For CPUs with support for HCRX_EL2 (FEAT_HCX) present: + + - If EL3 is present and the kernel is entered at EL2: + + - SCR_EL3.HXEn (bit 38) must be initialised to 0b1. + For CPUs with Advanced SIMD and floating point support: - If EL3 is present: -- cgit v1.2.3 From 63ebdb77afa96068ac570e87643eb4cd5b3e31c3 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 18 May 2021 17:33:31 +0100 Subject: kselftest/arm64: Add missing newline to SVE test skipping output The newline is expected to come from the caller but got missed for this test. 
Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20210518163331.38268-1-broonie@kernel.org Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/fp/sve-probe-vls.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/arm64/fp/sve-probe-vls.c b/tools/testing/selftests/arm64/fp/sve-probe-vls.c index b29cbc642c57..76e138525d55 100644 --- a/tools/testing/selftests/arm64/fp/sve-probe-vls.c +++ b/tools/testing/selftests/arm64/fp/sve-probe-vls.c @@ -25,7 +25,7 @@ int main(int argc, char **argv) ksft_set_plan(2); if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) - ksft_exit_skip("SVE not available"); + ksft_exit_skip("SVE not available\n"); /* * Enumerate up to SVE_VQ_MAX vector lengths -- cgit v1.2.3 From 3d0cca0b02ac98eac9157b26cf3951997db68b37 Mon Sep 17 00:00:00 2001 From: Evgenii Stepanov Date: Thu, 20 May 2021 18:00:23 -0700 Subject: kasan: speed up mte_set_mem_tag_range Use DC GVA / DC GZVA to speed up KASan memory tagging in HW tags mode. The first cacheline is always tagged using STG/STZG even if the address is cacheline-aligned, as benchmarks show it is faster than a conditional branch. Signed-off-by: Evgenii Stepanov Co-developed-by: Peter Collingbourne Signed-off-by: Peter Collingbourne Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20210521010023.3244784-1-eugenis@google.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/mte-kasan.h | 93 +++++++++++++++++++++++++++----------- 1 file changed, 67 insertions(+), 26 deletions(-) diff --git a/arch/arm64/include/asm/mte-kasan.h b/arch/arm64/include/asm/mte-kasan.h index ddd4d17cf9a0..d952352bd008 100644 --- a/arch/arm64/include/asm/mte-kasan.h +++ b/arch/arm64/include/asm/mte-kasan.h @@ -48,43 +48,84 @@ static inline u8 mte_get_random_tag(void) return mte_get_ptr_tag(addr); } +static inline u64 __stg_post(u64 p) +{ + asm volatile(__MTE_PREAMBLE "stg %0, [%0], #16" + : "+r"(p) + : + : "memory"); + return p; +} + +static inline u64 __stzg_post(u64 p) +{ + asm volatile(__MTE_PREAMBLE "stzg %0, [%0], #16" + : "+r"(p) + : + : "memory"); + return p; +} + +static inline void __dc_gva(u64 p) +{ + asm volatile(__MTE_PREAMBLE "dc gva, %0" : : "r"(p) : "memory"); +} + +static inline void __dc_gzva(u64 p) +{ + asm volatile(__MTE_PREAMBLE "dc gzva, %0" : : "r"(p) : "memory"); +} + /* * Assign allocation tags for a region of memory based on the pointer tag. * Note: The address must be non-NULL and MTE_GRANULE_SIZE aligned and - * size must be non-zero and MTE_GRANULE_SIZE aligned. + * size must be MTE_GRANULE_SIZE aligned. */ -static inline void mte_set_mem_tag_range(void *addr, size_t size, - u8 tag, bool init) +static inline void mte_set_mem_tag_range(void *addr, size_t size, u8 tag, + bool init) { - u64 curr, end; + u64 curr, mask, dczid_bs, end1, end2, end3; - if (!size) - return; + /* Read DC G(Z)VA block size from the system register. */ + dczid_bs = 4ul << (read_cpuid(DCZID_EL0) & 0xf); curr = (u64)__tag_set(addr, tag); - end = curr + size; + mask = dczid_bs - 1; + /* STG/STZG up to the end of the first block. */ + end1 = curr | mask; + end3 = curr + size; + /* DC GVA / GZVA in [end1, end2) */ + end2 = end3 & ~mask; /* - * 'asm volatile' is required to prevent the compiler to move - * the statement outside of the loop. + * The following code uses STG on the first DC GVA block even if the + * start address is aligned - it appears to be faster than an alignment + * check + conditional branch. 
Also, if the range size is at least 2 DC + * GVA blocks, the first two loops can use post-condition to save one + * branch each. */ - if (init) { - do { - asm volatile(__MTE_PREAMBLE "stzg %0, [%0]" - : - : "r" (curr) - : "memory"); - curr += MTE_GRANULE_SIZE; - } while (curr != end); - } else { - do { - asm volatile(__MTE_PREAMBLE "stg %0, [%0]" - : - : "r" (curr) - : "memory"); - curr += MTE_GRANULE_SIZE; - } while (curr != end); - } +#define SET_MEMTAG_RANGE(stg_post, dc_gva) \ + do { \ + if (size >= 2 * dczid_bs) { \ + do { \ + curr = stg_post(curr); \ + } while (curr < end1); \ + \ + do { \ + dc_gva(curr); \ + curr += dczid_bs; \ + } while (curr < end2); \ + } \ + \ + while (curr < end3) \ + curr = stg_post(curr); \ + } while (0) + + if (init) + SET_MEMTAG_RANGE(__stzg_post, __dc_gzva); + else + SET_MEMTAG_RANGE(__stg_post, __dc_gva); +#undef SET_MEMTAG_RANGE } void mte_enable_kernel_sync(void); -- cgit v1.2.3 From 40221c737608cf324870c58ef063159c3a6a4c81 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 24 May 2021 13:10:30 +0530 Subject: arm64/mm: Make vmemmap_free() available only with CONFIG_MEMORY_HOTPLUG vmemmap_free() callsites (mm/sparse.c) and declaration (include/linux/mm.h) are protected with CONFIG_MEMORY_HOTPLUG. This function is not required if CONFIG_MEMORY_HOTPLUG is not enabled. Hence move the config wrapper outside the function definition. Cc: Catalin Marinas Cc: Will Deacon Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Link: https://lore.kernel.org/r/1621842030-23256-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Will Deacon --- arch/arm64/mm/mmu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 6dd9369e3ea0..3d34cd127f6b 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1166,16 +1166,17 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, return 0; } #endif /* !ARM64_SWAPPER_USES_SECTION_MAPS */ + +#ifdef CONFIG_MEMORY_HOTPLUG void vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap) { -#ifdef CONFIG_MEMORY_HOTPLUG WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END)); unmap_hotplug_range(start, end, true, altmap); free_empty_tables(start, end, VMEMMAP_START, VMEMMAP_END); -#endif } +#endif /* CONFIG_MEMORY_HOTPLUG */ static inline pud_t *fixmap_pud(unsigned long addr) { -- cgit v1.2.3 From e89d6cc51034998607502cd3899173bfa7189571 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 24 May 2021 09:29:44 +0100 Subject: arm64: assembler: replace `kaddr` with `addr` The `__dcache_op_workaround_clean_cache` and `dcache_by_line_op` macros are only expected to be usedc on kernel memory, without a user fault fixup, and so we named their address variables `kaddr` to make this clear. Subseuqent patches will modify these to also work on user memory with an (optional) user fault fixup, where `kaddr` won't make as much sense. To aid the legibility of patches, this patch (only) replaces `kaddr` with `addr` as a preparatory step. There should be no functional change as a result of this patch. 
Signed-off-by: Mark Rutland Signed-off-by: Fuad Tabba Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: Fuad Tabba Cc: Will Deacon Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20210524083001.2586635-2-tabba@google.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/assembler.h | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 8418c1bd8f04..6a0fbc599196 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -377,47 +377,47 @@ alternative_cb_end /* * Macro to perform a data cache maintenance for the interval - * [kaddr, kaddr + size) + * [addr, addr + size) * * op: operation passed to dc instruction * domain: domain used in dsb instruciton - * kaddr: starting virtual address of the region + * addr: starting virtual address of the region * size: size of the region - * Corrupts: kaddr, size, tmp1, tmp2 + * Corrupts: addr, size, tmp1, tmp2 */ - .macro __dcache_op_workaround_clean_cache, op, kaddr + .macro __dcache_op_workaround_clean_cache, op, addr alternative_if_not ARM64_WORKAROUND_CLEAN_CACHE - dc \op, \kaddr + dc \op, \addr alternative_else - dc civac, \kaddr + dc civac, \addr alternative_endif .endm - .macro dcache_by_line_op op, domain, kaddr, size, tmp1, tmp2 + .macro dcache_by_line_op op, domain, addr, size, tmp1, tmp2 dcache_line_size \tmp1, \tmp2 - add \size, \kaddr, \size + add \size, \addr, \size sub \tmp2, \tmp1, #1 - bic \kaddr, \kaddr, \tmp2 + bic \addr, \addr, \tmp2 9998: .ifc \op, cvau - __dcache_op_workaround_clean_cache \op, \kaddr + __dcache_op_workaround_clean_cache \op, \addr .else .ifc \op, cvac - __dcache_op_workaround_clean_cache \op, \kaddr + __dcache_op_workaround_clean_cache \op, \addr .else .ifc \op, cvap - sys 3, c7, c12, 1, \kaddr // dc cvap + sys 3, c7, c12, 1, \addr // dc cvap .else .ifc \op, cvadp - sys 3, c7, c13, 1, \kaddr // dc cvadp + sys 3, c7, c13, 1, \addr // dc cvadp .else - dc \op, \kaddr + dc \op, \addr .endif .endif .endif .endif - add \kaddr, \kaddr, \tmp1 - cmp \kaddr, \size + add \addr, \addr, \tmp1 + cmp \addr, \size b.lo 9998b dsb \domain .endm -- cgit v1.2.3 From d11b187760f52480dd83bda0429ee3c94e542b1d Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 24 May 2021 09:29:45 +0100 Subject: arm64: assembler: add conditional cache fixups It would be helpful if we could use both `dcache_by_line_op` and `invalidate_icache_by_line` for user memory without accidentally fixing up unexpected faults when performing maintenance on kernel addresses. Let's make this possible by having both macros take an optional fixup label, and only generating an extable entry if a label is provided. At the same time, let's clean up the labels used to be globally unique using \@ as we do for other macros. There should be no functional change as a result of this patch. 
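For reference, the entries emitted by _asm_extable are a pair of 32-bit offsets relative to the entry itself, so skipping the macro when no fixup label is supplied really does mean no fixup is ever applied for that instruction. A rough C view of how such an entry resolves (the helper names are illustrative rather than the exact kernel ones):

struct exception_table_entry {
	int insn;	/* .long (insn - .)  : offset back to the faulting instruction */
	int fixup;	/* .long (fixup - .) : offset to the fixup branch target */
};

static inline unsigned long ex_insn_addr(const struct exception_table_entry *e)
{
	return (unsigned long)&e->insn + e->insn;
}

static inline unsigned long ex_fixup_addr(const struct exception_table_entry *e)
{
	return (unsigned long)&e->fixup + e->fixup;
}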
Signed-off-by: Mark Rutland Signed-off-by: Fuad Tabba Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: Fuad Tabba Cc: Will Deacon Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20210524083001.2586635-3-tabba@google.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/assembler.h | 39 +++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 6a0fbc599196..0a276b46ef50 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -130,15 +130,27 @@ alternative_endif .endm /* - * Emit an entry into the exception table + * Create an exception table entry for `insn`, which will branch to `fixup` + * when an unhandled fault is taken. */ - .macro _asm_extable, from, to + .macro _asm_extable, insn, fixup .pushsection __ex_table, "a" .align 3 - .long (\from - .), (\to - .) + .long (\insn - .), (\fixup - .) .popsection .endm +/* + * Create an exception table entry for `insn` if `fixup` is provided. Otherwise + * do nothing. + */ + .macro _cond_extable, insn, fixup + .ifnc \fixup, + _asm_extable \insn, \fixup + .endif + .endm + + #define USER(l, x...) \ 9999: x; \ _asm_extable 9999b, l @@ -383,6 +395,7 @@ alternative_cb_end * domain: domain used in dsb instruciton * addr: starting virtual address of the region * size: size of the region + * fixup: optional label to branch to on user fault * Corrupts: addr, size, tmp1, tmp2 */ .macro __dcache_op_workaround_clean_cache, op, addr @@ -393,12 +406,12 @@ alternative_else alternative_endif .endm - .macro dcache_by_line_op op, domain, addr, size, tmp1, tmp2 + .macro dcache_by_line_op op, domain, addr, size, tmp1, tmp2, fixup dcache_line_size \tmp1, \tmp2 add \size, \addr, \size sub \tmp2, \tmp1, #1 bic \addr, \addr, \tmp2 -9998: +.Ldcache_op\@: .ifc \op, cvau __dcache_op_workaround_clean_cache \op, \addr .else @@ -418,8 +431,10 @@ alternative_endif .endif add \addr, \addr, \tmp1 cmp \addr, \size - b.lo 9998b + b.lo .Ldcache_op\@ dsb \domain + + _cond_extable .Ldcache_op\@, \fixup .endm /* @@ -427,20 +442,22 @@ alternative_endif * [start, end) * * start, end: virtual addresses describing the region - * label: A label to branch to on user fault. + * fixup: optional label to branch to on user fault * Corrupts: tmp1, tmp2 */ - .macro invalidate_icache_by_line start, end, tmp1, tmp2, label + .macro invalidate_icache_by_line start, end, tmp1, tmp2, fixup icache_line_size \tmp1, \tmp2 sub \tmp2, \tmp1, #1 bic \tmp2, \start, \tmp2 -9997: -USER(\label, ic ivau, \tmp2) // invalidate I line PoU +.Licache_op\@: + ic ivau, \tmp2 // invalidate I line PoU add \tmp2, \tmp2, \tmp1 cmp \tmp2, \end - b.lo 9997b + b.lo .Licache_op\@ dsb ish isb + + _cond_extable .Licache_op\@, \fixup .endm /* -- cgit v1.2.3 From 46710cf1fcb6235388e8d80619cdf2c196ad554b Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 24 May 2021 09:29:46 +0100 Subject: arm64: Apply errata to swsusp_arch_suspend_exit The Arm errata covered by ARM64_WORKAROUND_CLEAN_CACHE require that "dc cvau" instructions get promoted to "dc civac". 
Reported-by: Mark Rutland Acked-by: Mark Rutland Signed-off-by: Fuad Tabba Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20210524083001.2586635-4-tabba@google.com Signed-off-by: Will Deacon --- arch/arm64/kernel/hibernate-asm.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kernel/hibernate-asm.S b/arch/arm64/kernel/hibernate-asm.S index 8ccca660034e..0ed2f72a6b94 100644 --- a/arch/arm64/kernel/hibernate-asm.S +++ b/arch/arm64/kernel/hibernate-asm.S @@ -91,7 +91,8 @@ SYM_CODE_START(swsusp_arch_suspend_exit) raw_dcache_line_size x2, x3 sub x3, x2, #1 bic x4, x10, x3 -2: dc cvau, x4 /* clean D line / unified line */ +2: /* clean D line / unified line */ +alternative_insn "dc cvau, x4", "dc civac, x4", ARM64_WORKAROUND_CLEAN_CACHE add x4, x4, x2 cmp x4, x1 b.lo 2b -- cgit v1.2.3 From 116b7f559492b719ae4bd22ee773cb7fb046a736 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 24 May 2021 09:29:47 +0100 Subject: arm64: Do not enable uaccess for flush_icache_range __flush_icache_range works on kernel addresses, and doesn't need uaccess. The existing code is a side-effect of its current implementation with __flush_cache_user_range fallthrough. Instead of fallthrough to share the code, use a common macro for the two where the caller specifies an optional fixup label if user access is needed. If provided, this label would be used to generate an extable entry. Simplify the code to use dcache_by_line_op, instead of replicating much of its functionality. No functional change intended. Possible performance impact due to the reduced number of instructions. Reported-by: Catalin Marinas Reported-by: Will Deacon Reported-by: Mark Rutland Link: https://lore.kernel.org/linux-arch/20200511110014.lb9PEahJ4hVOYrbwIb_qUHXyNy9KQzNFdb_I3YlzY6A@z/ Link: https://lore.kernel.org/linux-arm-kernel/20210521121846.GB1040@C02TD0UTHF1T.local/ Signed-off-by: Fuad Tabba Acked-by: Mark Rutland Acked-by: Catalin Marinas Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20210524083001.2586635-5-tabba@google.com Signed-off-by: Will Deacon --- arch/arm64/mm/cache.S | 57 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 23 deletions(-) diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index 2d881f34dd9d..7c54bcbf5a36 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -14,6 +14,34 @@ #include #include +/* + * __flush_cache_range(start,end) [fixup] + * + * Ensure that the I and D caches are coherent within specified region. + * This is typically used when code has been written to a memory region, + * and will be executed. 
+ * + * - start - virtual start address of region + * - end - virtual end address of region + * - fixup - optional label to branch to on user fault + */ +.macro __flush_cache_range, fixup +alternative_if ARM64_HAS_CACHE_IDC + dsb ishst + b .Ldc_skip_\@ +alternative_else_nop_endif + mov x2, x0 + sub x3, x1, x0 + dcache_by_line_op cvau, ish, x2, x3, x4, x5, \fixup +.Ldc_skip_\@: +alternative_if ARM64_HAS_CACHE_DIC + isb + b .Lic_skip_\@ +alternative_else_nop_endif + invalidate_icache_by_line x0, x1, x2, x3, \fixup +.Lic_skip_\@: +.endm + /* * flush_icache_range(start,end) * @@ -25,7 +53,9 @@ * - end - virtual end address of region */ SYM_FUNC_START(__flush_icache_range) - /* FALLTHROUGH */ + __flush_cache_range + ret +SYM_FUNC_END(__flush_icache_range) /* * __flush_cache_user_range(start,end) @@ -39,34 +69,15 @@ SYM_FUNC_START(__flush_icache_range) */ SYM_FUNC_START(__flush_cache_user_range) uaccess_ttbr0_enable x2, x3, x4 -alternative_if ARM64_HAS_CACHE_IDC - dsb ishst - b 7f -alternative_else_nop_endif - dcache_line_size x2, x3 - sub x3, x2, #1 - bic x4, x0, x3 -1: -user_alt 9f, "dc cvau, x4", "dc civac, x4", ARM64_WORKAROUND_CLEAN_CACHE - add x4, x4, x2 - cmp x4, x1 - b.lo 1b - dsb ish -7: -alternative_if ARM64_HAS_CACHE_DIC - isb - b 8f -alternative_else_nop_endif - invalidate_icache_by_line x0, x1, x2, x3, 9f -8: mov x0, #0 + __flush_cache_range 2f + mov x0, xzr 1: uaccess_ttbr0_disable x1, x2 ret -9: +2: mov x0, #-EFAULT b 1b -SYM_FUNC_END(__flush_icache_range) SYM_FUNC_END(__flush_cache_user_range) /* -- cgit v1.2.3 From 7908072da535dca52b3a011ed6e1f73534546b59 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 24 May 2021 09:29:48 +0100 Subject: arm64: Do not enable uaccess for invalidate_icache_range invalidate_icache_range() works on kernel addresses, and doesn't need uaccess. Remove the code that toggles uaccess_ttbr0_enable, as well as the code that emits an entry into the exception table (via the macro invalidate_icache_by_line). Changes return type of invalidate_icache_range() from int (which used to indicate a fault) to void, since it doesn't need uaccess and won't fault. Note that return value was never checked by any of the callers. No functional change intended. Possible performance impact due to the reduced number of instructions. 
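A minimal sketch of a caller after this change (the helper and its argument names are hypothetical); since the function now returns void, there is no fault code left to check or propagate:

    /*
     * Hypothetical caller: invalidate stale I-cache lines for a kernel
     * region whose D-cache has already been cleaned.
     */
    static void invalidate_new_text(void *text, size_t len)
    {
            invalidate_icache_range((unsigned long)text,
                                    (unsigned long)text + len);
    }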
Reported-by: Catalin Marinas Reported-by: Will Deacon Link: https://lore.kernel.org/linux-arch/20200511110014.lb9PEahJ4hVOYrbwIb_qUHXyNy9KQzNFdb_I3YlzY6A@z/ Acked-by: Mark Rutland Signed-off-by: Fuad Tabba Acked-by: Catalin Marinas Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20210524083001.2586635-6-tabba@google.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/cacheflush.h | 2 +- arch/arm64/mm/cache.S | 11 +---------- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 52e5c1623224..a586afa84172 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -57,7 +57,7 @@ * - size - region size */ extern void __flush_icache_range(unsigned long start, unsigned long end); -extern int invalidate_icache_range(unsigned long start, unsigned long end); +extern void invalidate_icache_range(unsigned long start, unsigned long end); extern void __flush_dcache_area(void *addr, size_t len); extern void __inval_dcache_area(void *addr, size_t len); extern void __clean_dcache_area_poc(void *addr, size_t len); diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index 7c54bcbf5a36..14eac9d76d57 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -90,21 +90,12 @@ SYM_FUNC_END(__flush_cache_user_range) */ SYM_FUNC_START(invalidate_icache_range) alternative_if ARM64_HAS_CACHE_DIC - mov x0, xzr isb ret alternative_else_nop_endif - uaccess_ttbr0_enable x2, x3, x4 - - invalidate_icache_by_line x0, x1, x2, x3, 2f - mov x0, xzr -1: - uaccess_ttbr0_disable x1, x2 + invalidate_icache_by_line x0, x1, x2, x3 ret -2: - mov x0, #-EFAULT - b 1b SYM_FUNC_END(invalidate_icache_range) /* -- cgit v1.2.3 From 5e20e3499682c4f1724438d23afcafd473526a54 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 24 May 2021 09:29:49 +0100 Subject: arm64: Downgrade flush_icache_range to invalidate Since __flush_dcache_area is called right before, invalidate_icache_range is sufficient in this case. Rewrite the comment to better explain the rationale behind the cache maintenance operations used here. No functional change intended. Possible performance impact due to invalidating only the icache rather than invalidating and cleaning both caches. Reported-by: Catalin Marinas Reported-by: Will Deacon Link: https://lore.kernel.org/linux-arch/20200511110014.lb9PEahJ4hVOYrbwIb_qUHXyNy9KQzNFdb_I3YlzY6A@z/ Acked-by: Mark Rutland Signed-off-by: Fuad Tabba Acked-by: Catalin Marinas Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20210524083001.2586635-7-tabba@google.com Signed-off-by: Will Deacon --- arch/arm64/kernel/machine_kexec.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c index 90a335c74442..a03944fd0cd4 100644 --- a/arch/arm64/kernel/machine_kexec.c +++ b/arch/arm64/kernel/machine_kexec.c @@ -68,10 +68,14 @@ int machine_kexec_post_load(struct kimage *kimage) kimage->arch.kern_reloc = __pa(reloc_code); kexec_image_info(kimage); - /* Flush the reloc_code in preparation for its execution. */ + /* + * For execution with the MMU off, reloc_code needs to be cleaned to the + * PoC and invalidated from the I-cache. 
+ */ __flush_dcache_area(reloc_code, arm64_relocate_new_kernel_size); - flush_icache_range((uintptr_t)reloc_code, (uintptr_t)reloc_code + - arm64_relocate_new_kernel_size); + invalidate_icache_range((uintptr_t)reloc_code, + (uintptr_t)reloc_code + + arm64_relocate_new_kernel_size); return 0; } -- cgit v1.2.3 From 55272ecc3ada8ec947bb5e94ee2fcde6cf31e166 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 24 May 2021 09:29:50 +0100 Subject: arm64: assembler: remove user_alt user_alt isn't being used anymore. It's also simpler and clearer to directly use alternative_insn and _cond_extable in-line when needed. Reported-by: Mark Rutland Link: https://lore.kernel.org/linux-arm-kernel/20210520125735.GF17233@C02TD0UTHF1T.local/ Signed-off-by: Fuad Tabba Acked-by: Mark Rutland Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20210524083001.2586635-8-tabba@google.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/alternative-macros.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/arm64/include/asm/alternative-macros.h b/arch/arm64/include/asm/alternative-macros.h index 8a078fc662ac..477703578caa 100644 --- a/arch/arm64/include/asm/alternative-macros.h +++ b/arch/arm64/include/asm/alternative-macros.h @@ -197,11 +197,6 @@ alternative_endif #define _ALTERNATIVE_CFG(insn1, insn2, cap, cfg, ...) \ alternative_insn insn1, insn2, cap, IS_ENABLED(cfg) -.macro user_alt, label, oldinstr, newinstr, cond -9999: alternative_insn "\oldinstr", "\newinstr", \cond - _asm_extable 9999b, \label -.endm - #endif /* __ASSEMBLY__ */ /* -- cgit v1.2.3 From 06b7a568ca5e9cb79a0cc4737f498ea90d8fa89d Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 24 May 2021 09:29:51 +0100 Subject: arm64: Move documentation of dcache_by_line_op The comment describing the macro dcache_by_line_op is placed right before the previous macro of the one it describes, which is a bit confusing. Move it to the macro it describes (dcache_by_line_op). No functional change intended. 
Acked-by: Mark Rutland Signed-off-by: Fuad Tabba Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20210524083001.2586635-9-tabba@google.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/assembler.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 0a276b46ef50..ced791124b28 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -387,6 +387,14 @@ alternative_cb_end bfi \tcr, \tmp0, \pos, #3 .endm + .macro __dcache_op_workaround_clean_cache, op, addr +alternative_if_not ARM64_WORKAROUND_CLEAN_CACHE + dc \op, \addr +alternative_else + dc civac, \addr +alternative_endif + .endm + /* * Macro to perform a data cache maintenance for the interval * [addr, addr + size) @@ -398,14 +406,6 @@ alternative_cb_end * fixup: optional label to branch to on user fault * Corrupts: addr, size, tmp1, tmp2 */ - .macro __dcache_op_workaround_clean_cache, op, addr -alternative_if_not ARM64_WORKAROUND_CLEAN_CACHE - dc \op, \addr -alternative_else - dc civac, \addr -alternative_endif - .endm - .macro dcache_by_line_op op, domain, addr, size, tmp1, tmp2, fixup dcache_line_size \tmp1, \tmp2 add \size, \addr, \size -- cgit v1.2.3 From d044f8141847bee542998a6fd8de2c270fe40e48 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 24 May 2021 09:29:52 +0100 Subject: arm64: Fix comments to refer to correct function __flush_icache_range Many comments refer to the function flush_icache_range, where the intent is in fact __flush_icache_range. Fix these comments to refer to the intended function. That's probably due to commit 3b8c9f1cdfc506e9 ("arm64: IPI each CPU after invalidating the I-cache for kernel mappings"), which renamed flush_icache_range() to __flush_icache_range() and added a wrapper. No functional change intended. Acked-by: Mark Rutland Signed-off-by: Fuad Tabba Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20210524083001.2586635-10-tabba@google.com Signed-off-by: Will Deacon --- arch/arm64/kernel/hibernate-asm.S | 4 ++-- arch/arm64/mm/cache.S | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/hibernate-asm.S b/arch/arm64/kernel/hibernate-asm.S index 0ed2f72a6b94..ef2ab7caf815 100644 --- a/arch/arm64/kernel/hibernate-asm.S +++ b/arch/arm64/kernel/hibernate-asm.S @@ -45,7 +45,7 @@ * Because this code has to be copied to a 'safe' page, it can't call out to * other functions by PC-relative address. Also remember that it may be * mid-way through over-writing other functions. For this reason it contains - * code from flush_icache_range() and uses the copy_page() macro. + * code from __flush_icache_range() and uses the copy_page() macro. * * This 'safe' page is mapped via ttbr0, and executed from there. 
This function * switches to a copy of the linear map in ttbr1, performs the restore, then @@ -87,7 +87,7 @@ SYM_CODE_START(swsusp_arch_suspend_exit) copy_page x0, x1, x2, x3, x4, x5, x6, x7, x8, x9 add x1, x10, #PAGE_SIZE - /* Clean the copied page to PoU - based on flush_icache_range() */ + /* Clean the copied page to PoU - based on __flush_icache_range() */ raw_dcache_line_size x2, x3 sub x3, x2, #1 bic x4, x10, x3 diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index 14eac9d76d57..910ae8f6a389 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -43,7 +43,7 @@ alternative_else_nop_endif .endm /* - * flush_icache_range(start,end) + * __flush_icache_range(start,end) * * Ensure that the I and D caches are coherent within specified region. * This is typically used when code has been written to a memory region, -- cgit v1.2.3 From e3974adb4ef591e898956083a3dfa6336bb88638 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 24 May 2021 09:29:53 +0100 Subject: arm64: __inval_dcache_area to take end parameter instead of size To be consistent with other functions with similar names and functionality in cacheflush.h, cache.S, and cachetlb.rst, change to specify the range in terms of start and end, as opposed to start and size. Because the code is shared with __dma_inv_area, it changes the parameters for that as well. However, __dma_inv_area is local to cache.S, so no other users are affected. No functional change intended. Reported-by: Will Deacon Acked-by: Mark Rutland Signed-off-by: Fuad Tabba Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20210524083001.2586635-11-tabba@google.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/cacheflush.h | 2 +- arch/arm64/kernel/head.S | 5 +---- arch/arm64/mm/cache.S | 16 +++++++++------- arch/arm64/mm/flush.c | 2 +- 4 files changed, 12 insertions(+), 13 deletions(-) diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index a586afa84172..157234706817 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -59,7 +59,7 @@ extern void __flush_icache_range(unsigned long start, unsigned long end); extern void invalidate_icache_range(unsigned long start, unsigned long end); extern void __flush_dcache_area(void *addr, size_t len); -extern void __inval_dcache_area(void *addr, size_t len); +extern void __inval_dcache_area(unsigned long start, unsigned long end); extern void __clean_dcache_area_poc(void *addr, size_t len); extern void __clean_dcache_area_pop(void *addr, size_t len); extern void __clean_dcache_area_pou(void *addr, size_t len); diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 96873dfa67fd..8df0ac8d9123 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -117,7 +117,7 @@ SYM_CODE_START_LOCAL(preserve_boot_args) dmb sy // needed before dc ivac with // MMU off - mov x1, #0x20 // 4 x 8 bytes + add x1, x0, #0x20 // 4 x 8 bytes b __inval_dcache_area // tail call SYM_CODE_END(preserve_boot_args) @@ -268,7 +268,6 @@ SYM_FUNC_START_LOCAL(__create_page_tables) */ adrp x0, init_pg_dir adrp x1, init_pg_end - sub x1, x1, x0 bl __inval_dcache_area /* @@ -382,12 +381,10 @@ SYM_FUNC_START_LOCAL(__create_page_tables) adrp x0, idmap_pg_dir adrp x1, idmap_pg_end - sub x1, x1, x0 bl __inval_dcache_area adrp x0, init_pg_dir adrp x1, init_pg_end - sub x1, x1, x0 bl __inval_dcache_area ret x28 diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index 910ae8f6a389..03c1a7659ffb 100644 --- a/arch/arm64/mm/cache.S 
+++ b/arch/arm64/mm/cache.S @@ -131,25 +131,24 @@ alternative_else_nop_endif SYM_FUNC_END(__clean_dcache_area_pou) /* - * __inval_dcache_area(kaddr, size) + * __inval_dcache_area(start, end) * - * Ensure that any D-cache lines for the interval [kaddr, kaddr+size) + * Ensure that any D-cache lines for the interval [start, end) * are invalidated. Any partial lines at the ends of the interval are * also cleaned to PoC to prevent data loss. * - * - kaddr - kernel address - * - size - size in question + * - start - kernel start address of region + * - end - kernel end address of region */ SYM_FUNC_START_LOCAL(__dma_inv_area) SYM_FUNC_START_PI(__inval_dcache_area) /* FALLTHROUGH */ /* - * __dma_inv_area(start, size) + * __dma_inv_area(start, end) * - start - virtual start address of region - * - size - size in question + * - end - virtual end address of region */ - add x1, x1, x0 dcache_line_size x2, x3 sub x3, x2, #1 tst x1, x3 // end cache line aligned? @@ -230,8 +229,10 @@ SYM_FUNC_END_PI(__dma_flush_area) * - dir - DMA direction */ SYM_FUNC_START_PI(__dma_map_area) + add x1, x0, x1 cmp w2, #DMA_FROM_DEVICE b.eq __dma_inv_area + sub x1, x1, x0 b __dma_clean_area SYM_FUNC_END_PI(__dma_map_area) @@ -242,6 +243,7 @@ SYM_FUNC_END_PI(__dma_map_area) * - dir - DMA direction */ SYM_FUNC_START_PI(__dma_unmap_area) + add x1, x0, x1 cmp w2, #DMA_TO_DEVICE b.ne __dma_inv_area ret diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index 6d44c028d1c9..be650b573b2a 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -90,7 +90,7 @@ EXPORT_SYMBOL_GPL(arch_wb_cache_pmem); void arch_invalidate_pmem(void *addr, size_t size) { - __inval_dcache_area(addr, size); + __inval_dcache_area((unsigned long)addr, (unsigned long)addr + size); } EXPORT_SYMBOL_GPL(arch_invalidate_pmem); #endif -- cgit v1.2.3 From 163d3f80695e31068c7d32244c9e6d406d5c5c00 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 24 May 2021 09:29:54 +0100 Subject: arm64: dcache_by_line_op to take end parameter instead of size To be consistent with other functions with similar names and functionality in cacheflush.h, cache.S, and cachetlb.rst, change to specify the range in terms of start and end, as opposed to start and size. No functional change intended. 
Reported-by: Will Deacon Acked-by: Mark Rutland Signed-off-by: Fuad Tabba Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20210524083001.2586635-12-tabba@google.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/assembler.h | 27 +++++++++++++-------------- arch/arm64/kvm/hyp/nvhe/cache.S | 1 + arch/arm64/mm/cache.S | 7 ++++++- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index ced791124b28..c4cecf85dccf 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -397,40 +397,39 @@ alternative_endif /* * Macro to perform a data cache maintenance for the interval - * [addr, addr + size) + * [start, end) * * op: operation passed to dc instruction * domain: domain used in dsb instruciton - * addr: starting virtual address of the region - * size: size of the region + * start: starting virtual address of the region + * end: end virtual address of the region * fixup: optional label to branch to on user fault - * Corrupts: addr, size, tmp1, tmp2 + * Corrupts: start, end, tmp1, tmp2 */ - .macro dcache_by_line_op op, domain, addr, size, tmp1, tmp2, fixup + .macro dcache_by_line_op op, domain, start, end, tmp1, tmp2, fixup dcache_line_size \tmp1, \tmp2 - add \size, \addr, \size sub \tmp2, \tmp1, #1 - bic \addr, \addr, \tmp2 + bic \start, \start, \tmp2 .Ldcache_op\@: .ifc \op, cvau - __dcache_op_workaround_clean_cache \op, \addr + __dcache_op_workaround_clean_cache \op, \start .else .ifc \op, cvac - __dcache_op_workaround_clean_cache \op, \addr + __dcache_op_workaround_clean_cache \op, \start .else .ifc \op, cvap - sys 3, c7, c12, 1, \addr // dc cvap + sys 3, c7, c12, 1, \start // dc cvap .else .ifc \op, cvadp - sys 3, c7, c13, 1, \addr // dc cvadp + sys 3, c7, c13, 1, \start // dc cvadp .else - dc \op, \addr + dc \op, \start .endif .endif .endif .endif - add \addr, \addr, \tmp1 - cmp \addr, \size + add \start, \start, \tmp1 + cmp \start, \end b.lo .Ldcache_op\@ dsb \domain diff --git a/arch/arm64/kvm/hyp/nvhe/cache.S b/arch/arm64/kvm/hyp/nvhe/cache.S index 36cef6915428..3bcfa3cac46f 100644 --- a/arch/arm64/kvm/hyp/nvhe/cache.S +++ b/arch/arm64/kvm/hyp/nvhe/cache.S @@ -8,6 +8,7 @@ #include SYM_FUNC_START_PI(__flush_dcache_area) + add x1, x0, x1 dcache_by_line_op civac, sy, x0, x1, x2, x3 ret SYM_FUNC_END_PI(__flush_dcache_area) diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index 03c1a7659ffb..fff883f691f2 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -31,7 +31,7 @@ alternative_if ARM64_HAS_CACHE_IDC b .Ldc_skip_\@ alternative_else_nop_endif mov x2, x0 - sub x3, x1, x0 + mov x3, x1 dcache_by_line_op cvau, ish, x2, x3, x4, x5, \fixup .Ldc_skip_\@: alternative_if ARM64_HAS_CACHE_DIC @@ -108,6 +108,7 @@ SYM_FUNC_END(invalidate_icache_range) * - size - size in question */ SYM_FUNC_START_PI(__flush_dcache_area) + add x1, x0, x1 dcache_by_line_op civac, sy, x0, x1, x2, x3 ret SYM_FUNC_END_PI(__flush_dcache_area) @@ -126,6 +127,7 @@ alternative_if ARM64_HAS_CACHE_IDC dsb ishst ret alternative_else_nop_endif + add x1, x0, x1 dcache_by_line_op cvau, ish, x0, x1, x2, x3 ret SYM_FUNC_END(__clean_dcache_area_pou) @@ -187,6 +189,7 @@ SYM_FUNC_START_PI(__clean_dcache_area_poc) * - start - virtual start address of region * - size - size in question */ + add x1, x0, x1 dcache_by_line_op cvac, sy, x0, x1, x2, x3 ret SYM_FUNC_END_PI(__clean_dcache_area_poc) @@ -205,6 +208,7 @@ SYM_FUNC_START_PI(__clean_dcache_area_pop) alternative_if_not 
ARM64_HAS_DCPOP b __clean_dcache_area_poc alternative_else_nop_endif + add x1, x0, x1 dcache_by_line_op cvap, sy, x0, x1, x2, x3 ret SYM_FUNC_END_PI(__clean_dcache_area_pop) @@ -218,6 +222,7 @@ SYM_FUNC_END_PI(__clean_dcache_area_pop) * - size - size in question */ SYM_FUNC_START_PI(__dma_flush_area) + add x1, x0, x1 dcache_by_line_op civac, sy, x0, x1, x2, x3 ret SYM_FUNC_END_PI(__dma_flush_area) -- cgit v1.2.3 From 814b186079cd54d3fe3b6b8ab539cbd44705ef9d Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 24 May 2021 09:29:55 +0100 Subject: arm64: __flush_dcache_area to take end parameter instead of size To be consistent with other functions with similar names and functionality in cacheflush.h, cache.S, and cachetlb.rst, change to specify the range in terms of start and end, as opposed to start and size. No functional change intended. Reported-by: Will Deacon Acked-by: Mark Rutland Signed-off-by: Fuad Tabba Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20210524083001.2586635-13-tabba@google.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/arch_gicv3.h | 3 ++- arch/arm64/include/asm/cacheflush.h | 8 ++++---- arch/arm64/include/asm/efi.h | 2 +- arch/arm64/include/asm/kvm_mmu.h | 3 ++- arch/arm64/kernel/hibernate.c | 18 +++++++++++------- arch/arm64/kernel/idreg-override.c | 3 ++- arch/arm64/kernel/kaslr.c | 12 +++++++++--- arch/arm64/kernel/machine_kexec.c | 20 +++++++++++++------- arch/arm64/kernel/smp.c | 8 ++++++-- arch/arm64/kernel/smp_spin_table.c | 7 ++++--- arch/arm64/kvm/hyp/nvhe/cache.S | 1 - arch/arm64/kvm/hyp/nvhe/setup.c | 3 ++- arch/arm64/kvm/hyp/pgtable.c | 13 ++++++++++--- arch/arm64/mm/cache.S | 9 ++++----- 14 files changed, 70 insertions(+), 40 deletions(-) diff --git a/arch/arm64/include/asm/arch_gicv3.h b/arch/arm64/include/asm/arch_gicv3.h index 934b9be582d2..ed1cc9d8e6df 100644 --- a/arch/arm64/include/asm/arch_gicv3.h +++ b/arch/arm64/include/asm/arch_gicv3.h @@ -124,7 +124,8 @@ static inline u32 gic_read_rpr(void) #define gic_read_lpir(c) readq_relaxed(c) #define gic_write_lpir(v, c) writeq_relaxed(v, c) -#define gic_flush_dcache_to_poc(a,l) __flush_dcache_area((a), (l)) +#define gic_flush_dcache_to_poc(a,l) \ + __flush_dcache_area((unsigned long)(a), (unsigned long)(a)+(l)) #define gits_read_baser(c) readq_relaxed(c) #define gits_write_baser(v, c) writeq_relaxed(v, c) diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 157234706817..695f88864784 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -50,15 +50,15 @@ * - start - virtual start address * - end - virtual end address * - * __flush_dcache_area(kaddr, size) + * __flush_dcache_area(start, end) * * Ensure that the data held in page is written back. 
- * - kaddr - page address - * - size - region size + * - start - virtual start address + * - end - virtual end address */ extern void __flush_icache_range(unsigned long start, unsigned long end); extern void invalidate_icache_range(unsigned long start, unsigned long end); -extern void __flush_dcache_area(void *addr, size_t len); +extern void __flush_dcache_area(unsigned long start, unsigned long end); extern void __inval_dcache_area(unsigned long start, unsigned long end); extern void __clean_dcache_area_poc(void *addr, size_t len); extern void __clean_dcache_area_pop(void *addr, size_t len); diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h index 3578aba9c608..0ae2397076fd 100644 --- a/arch/arm64/include/asm/efi.h +++ b/arch/arm64/include/asm/efi.h @@ -137,7 +137,7 @@ void efi_virtmap_unload(void); static inline void efi_capsule_flush_cache_range(void *addr, int size) { - __flush_dcache_area(addr, size); + __flush_dcache_area((unsigned long)addr, (unsigned long)addr + size); } #endif /* _ASM_EFI_H */ diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 25ed956f9af1..33293d5855af 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -180,7 +180,8 @@ static inline void *__kvm_vector_slot2addr(void *base, struct kvm; -#define kvm_flush_dcache_to_poc(a,l) __flush_dcache_area((a), (l)) +#define kvm_flush_dcache_to_poc(a,l) \ + __flush_dcache_area((unsigned long)(a), (unsigned long)(a)+(l)) static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu) { diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index b1cef371df2b..b40ddce71507 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -240,8 +240,6 @@ static int create_safe_exec_page(void *src_start, size_t length, return 0; } -#define dcache_clean_range(start, end) __flush_dcache_area(start, (end - start)) - #ifdef CONFIG_ARM64_MTE static DEFINE_XARRAY(mte_pages); @@ -383,13 +381,18 @@ int swsusp_arch_suspend(void) ret = swsusp_save(); } else { /* Clean kernel core startup/idle code to PoC*/ - dcache_clean_range(__mmuoff_data_start, __mmuoff_data_end); - dcache_clean_range(__idmap_text_start, __idmap_text_end); + __flush_dcache_area((unsigned long)__mmuoff_data_start, + (unsigned long)__mmuoff_data_end); + __flush_dcache_area((unsigned long)__idmap_text_start, + (unsigned long)__idmap_text_end); /* Clean kvm setup code to PoC? */ if (el2_reset_needed()) { - dcache_clean_range(__hyp_idmap_text_start, __hyp_idmap_text_end); - dcache_clean_range(__hyp_text_start, __hyp_text_end); + __flush_dcache_area( + (unsigned long)__hyp_idmap_text_start, + (unsigned long)__hyp_idmap_text_end); + __flush_dcache_area((unsigned long)__hyp_text_start, + (unsigned long)__hyp_text_end); } swsusp_mte_restore_tags(); @@ -474,7 +477,8 @@ int swsusp_arch_resume(void) * The hibernate exit text contains a set of el2 vectors, that will * be executed at el2 with the mmu off in order to reload hyp-stub. 
*/ - __flush_dcache_area(hibernate_exit, exit_size); + __flush_dcache_area((unsigned long)hibernate_exit, + (unsigned long)hibernate_exit + exit_size); /* * KASLR will cause the el2 vectors to be in a different location in diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/idreg-override.c index e628c8ce1ffe..3dd515baf526 100644 --- a/arch/arm64/kernel/idreg-override.c +++ b/arch/arm64/kernel/idreg-override.c @@ -237,7 +237,8 @@ asmlinkage void __init init_feature_override(void) for (i = 0; i < ARRAY_SIZE(regs); i++) { if (regs[i]->override) - __flush_dcache_area(regs[i]->override, + __flush_dcache_area((unsigned long)regs[i]->override, + (unsigned long)regs[i]->override + sizeof(*regs[i]->override)); } } diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index 341342b207f6..49cccd03cb37 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -72,7 +72,9 @@ u64 __init kaslr_early_init(void) * we end up running with module randomization disabled. */ module_alloc_base = (u64)_etext - MODULES_VSIZE; - __flush_dcache_area(&module_alloc_base, sizeof(module_alloc_base)); + __flush_dcache_area((unsigned long)&module_alloc_base, + (unsigned long)&module_alloc_base + + sizeof(module_alloc_base)); /* * Try to map the FDT early. If this fails, we simply bail, @@ -170,8 +172,12 @@ u64 __init kaslr_early_init(void) module_alloc_base += (module_range * (seed & ((1 << 21) - 1))) >> 21; module_alloc_base &= PAGE_MASK; - __flush_dcache_area(&module_alloc_base, sizeof(module_alloc_base)); - __flush_dcache_area(&memstart_offset_seed, sizeof(memstart_offset_seed)); + __flush_dcache_area((unsigned long)&module_alloc_base, + (unsigned long)&module_alloc_base + + sizeof(module_alloc_base)); + __flush_dcache_area((unsigned long)&memstart_offset_seed, + (unsigned long)&memstart_offset_seed + + sizeof(memstart_offset_seed)); return offset; } diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c index a03944fd0cd4..3e79110c8f3a 100644 --- a/arch/arm64/kernel/machine_kexec.c +++ b/arch/arm64/kernel/machine_kexec.c @@ -72,7 +72,9 @@ int machine_kexec_post_load(struct kimage *kimage) * For execution with the MMU off, reloc_code needs to be cleaned to the * PoC and invalidated from the I-cache. */ - __flush_dcache_area(reloc_code, arm64_relocate_new_kernel_size); + __flush_dcache_area((unsigned long)reloc_code, + (unsigned long)reloc_code + + arm64_relocate_new_kernel_size); invalidate_icache_range((uintptr_t)reloc_code, (uintptr_t)reloc_code + arm64_relocate_new_kernel_size); @@ -106,16 +108,18 @@ static void kexec_list_flush(struct kimage *kimage) for (entry = &kimage->head; ; entry++) { unsigned int flag; - void *addr; + unsigned long addr; /* flush the list entries. */ - __flush_dcache_area(entry, sizeof(kimage_entry_t)); + __flush_dcache_area((unsigned long)entry, + (unsigned long)entry + + sizeof(kimage_entry_t)); flag = *entry & IND_FLAGS; if (flag == IND_DONE) break; - addr = phys_to_virt(*entry & PAGE_MASK); + addr = (unsigned long)phys_to_virt(*entry & PAGE_MASK); switch (flag) { case IND_INDIRECTION: @@ -124,7 +128,7 @@ static void kexec_list_flush(struct kimage *kimage) break; case IND_SOURCE: /* flush the source pages. 
*/ - __flush_dcache_area(addr, PAGE_SIZE); + __flush_dcache_area(addr, addr + PAGE_SIZE); break; case IND_DESTINATION: break; @@ -151,8 +155,10 @@ static void kexec_segment_flush(const struct kimage *kimage) kimage->segment[i].memsz, kimage->segment[i].memsz / PAGE_SIZE); - __flush_dcache_area(phys_to_virt(kimage->segment[i].mem), - kimage->segment[i].memsz); + __flush_dcache_area( + (unsigned long)phys_to_virt(kimage->segment[i].mem), + (unsigned long)phys_to_virt(kimage->segment[i].mem) + + kimage->segment[i].memsz); } } diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index dcd7041b2b07..5fcdee331087 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -122,7 +122,9 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) secondary_data.task = idle; secondary_data.stack = task_stack_page(idle) + THREAD_SIZE; update_cpu_boot_status(CPU_MMU_OFF); - __flush_dcache_area(&secondary_data, sizeof(secondary_data)); + __flush_dcache_area((unsigned long)&secondary_data, + (unsigned long)&secondary_data + + sizeof(secondary_data)); /* Now bring the CPU into our world */ ret = boot_secondary(cpu, idle); @@ -143,7 +145,9 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) pr_crit("CPU%u: failed to come online\n", cpu); secondary_data.task = NULL; secondary_data.stack = NULL; - __flush_dcache_area(&secondary_data, sizeof(secondary_data)); + __flush_dcache_area((unsigned long)&secondary_data, + (unsigned long)&secondary_data + + sizeof(secondary_data)); status = READ_ONCE(secondary_data.status); if (status == CPU_MMU_OFF) status = READ_ONCE(__early_cpu_boot_status); diff --git a/arch/arm64/kernel/smp_spin_table.c b/arch/arm64/kernel/smp_spin_table.c index c45a83512805..58d804582a35 100644 --- a/arch/arm64/kernel/smp_spin_table.c +++ b/arch/arm64/kernel/smp_spin_table.c @@ -36,7 +36,7 @@ static void write_pen_release(u64 val) unsigned long size = sizeof(secondary_holding_pen_release); secondary_holding_pen_release = val; - __flush_dcache_area(start, size); + __flush_dcache_area((unsigned long)start, (unsigned long)start + size); } @@ -90,8 +90,9 @@ static int smp_spin_table_cpu_prepare(unsigned int cpu) * the boot protocol. */ writeq_relaxed(pa_holding_pen, release_addr); - __flush_dcache_area((__force void *)release_addr, - sizeof(*release_addr)); + __flush_dcache_area((__force unsigned long)release_addr, + (__force unsigned long)release_addr + + sizeof(*release_addr)); /* * Send an event to wake up the secondary CPU. 
diff --git a/arch/arm64/kvm/hyp/nvhe/cache.S b/arch/arm64/kvm/hyp/nvhe/cache.S index 3bcfa3cac46f..36cef6915428 100644 --- a/arch/arm64/kvm/hyp/nvhe/cache.S +++ b/arch/arm64/kvm/hyp/nvhe/cache.S @@ -8,7 +8,6 @@ #include SYM_FUNC_START_PI(__flush_dcache_area) - add x1, x0, x1 dcache_by_line_op civac, sy, x0, x1, x2, x3 ret SYM_FUNC_END_PI(__flush_dcache_area) diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 7488f53b0aa2..5dffe928f256 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -134,7 +134,8 @@ static void update_nvhe_init_params(void) for (i = 0; i < hyp_nr_cpus; i++) { params = per_cpu_ptr(&kvm_init_params, i); params->pgd_pa = __hyp_pa(pkvm_pgtable.pgd); - __flush_dcache_area(params, sizeof(*params)); + __flush_dcache_area((unsigned long)params, + (unsigned long)params + sizeof(*params)); } } diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index c37c1dc4feaf..10d2f04013d4 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -839,8 +839,11 @@ static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, stage2_put_pte(ptep, mmu, addr, level, mm_ops); if (need_flush) { - __flush_dcache_area(kvm_pte_follow(pte, mm_ops), - kvm_granule_size(level)); + kvm_pte_t *pte_follow = kvm_pte_follow(pte, mm_ops); + + __flush_dcache_area((unsigned long)pte_follow, + (unsigned long)pte_follow + + kvm_granule_size(level)); } if (childp) @@ -988,11 +991,15 @@ static int stage2_flush_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, struct kvm_pgtable *pgt = arg; struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; kvm_pte_t pte = *ptep; + kvm_pte_t *pte_follow; if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pgt, pte)) return 0; - __flush_dcache_area(kvm_pte_follow(pte, mm_ops), kvm_granule_size(level)); + pte_follow = kvm_pte_follow(pte, mm_ops); + __flush_dcache_area((unsigned long)pte_follow, + (unsigned long)pte_follow + + kvm_granule_size(level)); return 0; } diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index fff883f691f2..b2880aeba7ca 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -99,16 +99,15 @@ alternative_else_nop_endif SYM_FUNC_END(invalidate_icache_range) /* - * __flush_dcache_area(kaddr, size) + * __flush_dcache_area(start, end) * - * Ensure that any D-cache lines for the interval [kaddr, kaddr+size) + * Ensure that any D-cache lines for the interval [start, end) * are cleaned and invalidated to the PoC. * - * - kaddr - kernel address - * - size - size in question + * - start - virtual start address of region + * - end - virtual end address of region */ SYM_FUNC_START_PI(__flush_dcache_area) - add x1, x0, x1 dcache_by_line_op civac, sy, x0, x1, x2, x3 ret SYM_FUNC_END_PI(__flush_dcache_area) -- cgit v1.2.3 From 1f42faf1d25de2ae239f322fda8af1c92c20e953 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 24 May 2021 09:29:56 +0100 Subject: arm64: __clean_dcache_area_poc to take end parameter instead of size To be consistent with other functions with similar names and functionality in cacheflush.h, cache.S, and cachetlb.rst, change to specify the range in terms of start and end, as opposed to start and size. Because the code is shared with __dma_clean_area, it changes the parameters for that as well. However, __dma_clean_area is local to cache.S, so no other users are affected. No functional change intended. 
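The implied call-site conversion is mechanical; a hedged sketch with hypothetical buffer/length names, mirroring the call-site updates made elsewhere in this series:

    /* Hypothetical example of the size -> end conversion at a call site. */
    static void clean_buffer_to_poc(void *buf, size_t len)
    {
            /* old interface: __clean_dcache_area_poc(buf, len); */
            __clean_dcache_area_poc((unsigned long)buf,
                                    (unsigned long)buf + len);
    }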
Reported-by: Will Deacon Acked-by: Mark Rutland Signed-off-by: Fuad Tabba Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20210524083001.2586635-14-tabba@google.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/cacheflush.h | 2 +- arch/arm64/kernel/efi-entry.S | 5 +++-- arch/arm64/mm/cache.S | 16 +++++++--------- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 695f88864784..3255878d6f30 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -60,7 +60,7 @@ extern void __flush_icache_range(unsigned long start, unsigned long end); extern void invalidate_icache_range(unsigned long start, unsigned long end); extern void __flush_dcache_area(unsigned long start, unsigned long end); extern void __inval_dcache_area(unsigned long start, unsigned long end); -extern void __clean_dcache_area_poc(void *addr, size_t len); +extern void __clean_dcache_area_poc(unsigned long start, unsigned long end); extern void __clean_dcache_area_pop(void *addr, size_t len); extern void __clean_dcache_area_pou(void *addr, size_t len); extern long __flush_cache_user_range(unsigned long start, unsigned long end); diff --git a/arch/arm64/kernel/efi-entry.S b/arch/arm64/kernel/efi-entry.S index 0073b24b5d25..b0f728fb61f0 100644 --- a/arch/arm64/kernel/efi-entry.S +++ b/arch/arm64/kernel/efi-entry.S @@ -28,6 +28,7 @@ SYM_CODE_START(efi_enter_kernel) * stale icache entries from before relocation. */ ldr w1, =kernel_size + add x1, x0, x1 bl __clean_dcache_area_poc ic ialluis @@ -36,7 +37,7 @@ SYM_CODE_START(efi_enter_kernel) * so that we can safely disable the MMU and caches. */ adr x0, 0f - ldr w1, 3f + adr x1, 3f bl __clean_dcache_area_poc 0: /* Turn off Dcache and MMU */ @@ -64,5 +65,5 @@ SYM_CODE_START(efi_enter_kernel) mov x2, xzr mov x3, xzr br x19 +3: SYM_CODE_END(efi_enter_kernel) -3: .long . - 0b diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index b2880aeba7ca..e2e2740c55ce 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -171,24 +171,23 @@ SYM_FUNC_END_PI(__inval_dcache_area) SYM_FUNC_END(__dma_inv_area) /* - * __clean_dcache_area_poc(kaddr, size) + * __clean_dcache_area_poc(start, end) * - * Ensure that any D-cache lines for the interval [kaddr, kaddr+size) + * Ensure that any D-cache lines for the interval [start, end) * are cleaned to the PoC. 
* - * - kaddr - kernel address - * - size - size in question + * - start - virtual start address of region + * - end - virtual end address of region */ SYM_FUNC_START_LOCAL(__dma_clean_area) SYM_FUNC_START_PI(__clean_dcache_area_poc) /* FALLTHROUGH */ /* - * __dma_clean_area(start, size) + * __dma_clean_area(start, end) * - start - virtual start address of region - * - size - size in question + * - end - virtual end address of region */ - add x1, x0, x1 dcache_by_line_op cvac, sy, x0, x1, x2, x3 ret SYM_FUNC_END_PI(__clean_dcache_area_poc) @@ -204,10 +203,10 @@ SYM_FUNC_END(__dma_clean_area) * - size - size in question */ SYM_FUNC_START_PI(__clean_dcache_area_pop) + add x1, x0, x1 alternative_if_not ARM64_HAS_DCPOP b __clean_dcache_area_poc alternative_else_nop_endif - add x1, x0, x1 dcache_by_line_op cvap, sy, x0, x1, x2, x3 ret SYM_FUNC_END_PI(__clean_dcache_area_pop) @@ -236,7 +235,6 @@ SYM_FUNC_START_PI(__dma_map_area) add x1, x0, x1 cmp w2, #DMA_FROM_DEVICE b.eq __dma_inv_area - sub x1, x1, x0 b __dma_clean_area SYM_FUNC_END_PI(__dma_map_area) -- cgit v1.2.3 From f749448edb9c98bece0aeec5536260a8794af24b Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 24 May 2021 09:29:57 +0100 Subject: arm64: __clean_dcache_area_pop to take end parameter instead of size To be consistent with other functions with similar names and functionality in cacheflush.h, cache.S, and cachetlb.rst, change to specify the range in terms of start and end, as opposed to start and size. No functional change intended. Reported-by: Will Deacon Acked-by: Mark Rutland Signed-off-by: Fuad Tabba Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20210524083001.2586635-15-tabba@google.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/cacheflush.h | 2 +- arch/arm64/lib/uaccess_flushcache.c | 4 ++-- arch/arm64/mm/cache.S | 9 ++++----- arch/arm64/mm/flush.c | 2 +- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 3255878d6f30..fa5641868d65 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -61,7 +61,7 @@ extern void invalidate_icache_range(unsigned long start, unsigned long end); extern void __flush_dcache_area(unsigned long start, unsigned long end); extern void __inval_dcache_area(unsigned long start, unsigned long end); extern void __clean_dcache_area_poc(unsigned long start, unsigned long end); -extern void __clean_dcache_area_pop(void *addr, size_t len); +extern void __clean_dcache_area_pop(unsigned long start, unsigned long end); extern void __clean_dcache_area_pou(void *addr, size_t len); extern long __flush_cache_user_range(unsigned long start, unsigned long end); extern void sync_icache_aliases(void *kaddr, unsigned long len); diff --git a/arch/arm64/lib/uaccess_flushcache.c b/arch/arm64/lib/uaccess_flushcache.c index c83bb5a4aad2..62ea989effe8 100644 --- a/arch/arm64/lib/uaccess_flushcache.c +++ b/arch/arm64/lib/uaccess_flushcache.c @@ -15,7 +15,7 @@ void memcpy_flushcache(void *dst, const void *src, size_t cnt) * barrier to order the cache maintenance against the memcpy. 
*/ memcpy(dst, src, cnt); - __clean_dcache_area_pop(dst, cnt); + __clean_dcache_area_pop((unsigned long)dst, (unsigned long)dst + cnt); } EXPORT_SYMBOL_GPL(memcpy_flushcache); @@ -33,6 +33,6 @@ unsigned long __copy_user_flushcache(void *to, const void __user *from, rc = raw_copy_from_user(to, from, n); /* See above */ - __clean_dcache_area_pop(to, n - rc); + __clean_dcache_area_pop((unsigned long)to, (unsigned long)to + n - rc); return rc; } diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index e2e2740c55ce..b71fcf56516b 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -194,16 +194,15 @@ SYM_FUNC_END_PI(__clean_dcache_area_poc) SYM_FUNC_END(__dma_clean_area) /* - * __clean_dcache_area_pop(kaddr, size) + * __clean_dcache_area_pop(start, end) * - * Ensure that any D-cache lines for the interval [kaddr, kaddr+size) + * Ensure that any D-cache lines for the interval [start, end) * are cleaned to the PoP. * - * - kaddr - kernel address - * - size - size in question + * - start - virtual start address of region + * - end - virtual end address of region */ SYM_FUNC_START_PI(__clean_dcache_area_pop) - add x1, x0, x1 alternative_if_not ARM64_HAS_DCPOP b __clean_dcache_area_poc alternative_else_nop_endif diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index be650b573b2a..b2c226d93ca5 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -84,7 +84,7 @@ void arch_wb_cache_pmem(void *addr, size_t size) { /* Ensure order against any prior non-cacheable writes */ dmb(osh); - __clean_dcache_area_pop(addr, size); + __clean_dcache_area_pop((unsigned long)addr, (unsigned long)addr + size); } EXPORT_SYMBOL_GPL(arch_wb_cache_pmem); -- cgit v1.2.3 From 406d7d4e2bc76d38a6dc88733a0f72fabf02d305 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 24 May 2021 09:29:58 +0100 Subject: arm64: __clean_dcache_area_pou to take end parameter instead of size To be consistent with other functions with similar names and functionality in cacheflush.h, cache.S, and cachetlb.rst, change to specify the range in terms of start and end, as opposed to start and size. No functional change intended. 
Reported-by: Will Deacon Acked-by: Mark Rutland Signed-off-by: Fuad Tabba Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20210524083001.2586635-16-tabba@google.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/cacheflush.h | 2 +- arch/arm64/mm/cache.S | 9 ++++----- arch/arm64/mm/flush.c | 2 +- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index fa5641868d65..f86723047315 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -62,7 +62,7 @@ extern void __flush_dcache_area(unsigned long start, unsigned long end); extern void __inval_dcache_area(unsigned long start, unsigned long end); extern void __clean_dcache_area_poc(unsigned long start, unsigned long end); extern void __clean_dcache_area_pop(unsigned long start, unsigned long end); -extern void __clean_dcache_area_pou(void *addr, size_t len); +extern void __clean_dcache_area_pou(unsigned long start, unsigned long end); extern long __flush_cache_user_range(unsigned long start, unsigned long end); extern void sync_icache_aliases(void *kaddr, unsigned long len); diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index b71fcf56516b..ea605d94182f 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -113,20 +113,19 @@ SYM_FUNC_START_PI(__flush_dcache_area) SYM_FUNC_END_PI(__flush_dcache_area) /* - * __clean_dcache_area_pou(kaddr, size) + * __clean_dcache_area_pou(start, end) * - * Ensure that any D-cache lines for the interval [kaddr, kaddr+size) + * Ensure that any D-cache lines for the interval [start, end) * are cleaned to the PoU. * - * - kaddr - kernel address - * - size - size in question + * - start - virtual start address of region + * - end - virtual end address of region */ SYM_FUNC_START(__clean_dcache_area_pou) alternative_if ARM64_HAS_CACHE_IDC dsb ishst ret alternative_else_nop_endif - add x1, x0, x1 dcache_by_line_op cvau, ish, x0, x1, x2, x3 ret SYM_FUNC_END(__clean_dcache_area_pou) diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index b2c226d93ca5..0341bcc6fdf3 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -19,7 +19,7 @@ void sync_icache_aliases(void *kaddr, unsigned long len) unsigned long addr = (unsigned long)kaddr; if (icache_is_aliasing()) { - __clean_dcache_area_pou(kaddr, len); + __clean_dcache_area_pou(kaddr, kaddr + len); __flush_icache_all(); } else { /* -- cgit v1.2.3 From 8c28d52ccd1d6e3a5aca8a37e465a5f8b77edbc1 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 24 May 2021 09:29:59 +0100 Subject: arm64: sync_icache_aliases to take end parameter instead of size To be consistent with other functions with similar names and functionality in cacheflush.h, cache.S, and cachetlb.rst, change to specify the range in terms of start and end, as opposed to start and size. No functional change intended. 
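For illustration, a typical user of this interface writes instructions to memory and then makes the caches coherent; a minimal hedged sketch with hypothetical names, modeled on the uprobes call site updated below:

    /* Hypothetical sketch: publish newly written instructions at dst. */
    static void copy_and_sync_insns(void *dst, const void *src, size_t len)
    {
            memcpy(dst, src, len);
            sync_icache_aliases((unsigned long)dst,
                                (unsigned long)dst + len);
    }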
Reported-by: Will Deacon Acked-by: Mark Rutland Signed-off-by: Fuad Tabba Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20210524083001.2586635-17-tabba@google.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/cacheflush.h | 2 +- arch/arm64/kernel/probes/uprobes.c | 2 +- arch/arm64/mm/flush.c | 21 ++++++++++----------- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index f86723047315..70b389a8dea5 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -64,7 +64,7 @@ extern void __clean_dcache_area_poc(unsigned long start, unsigned long end); extern void __clean_dcache_area_pop(unsigned long start, unsigned long end); extern void __clean_dcache_area_pou(unsigned long start, unsigned long end); extern long __flush_cache_user_range(unsigned long start, unsigned long end); -extern void sync_icache_aliases(void *kaddr, unsigned long len); +extern void sync_icache_aliases(unsigned long start, unsigned long end); static inline void flush_icache_range(unsigned long start, unsigned long end) { diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c index 2c247634552b..9be668f3f034 100644 --- a/arch/arm64/kernel/probes/uprobes.c +++ b/arch/arm64/kernel/probes/uprobes.c @@ -21,7 +21,7 @@ void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, memcpy(dst, src, len); /* flush caches (dcache/icache) */ - sync_icache_aliases(dst, len); + sync_icache_aliases((unsigned long)dst, (unsigned long)dst + len); kunmap_atomic(xol_page_kaddr); } diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index 0341bcc6fdf3..c4ca7e05fdb8 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -14,28 +14,25 @@ #include #include -void sync_icache_aliases(void *kaddr, unsigned long len) +void sync_icache_aliases(unsigned long start, unsigned long end) { - unsigned long addr = (unsigned long)kaddr; - if (icache_is_aliasing()) { - __clean_dcache_area_pou(kaddr, kaddr + len); + __clean_dcache_area_pou(start, end); __flush_icache_all(); } else { /* * Don't issue kick_all_cpus_sync() after I-cache invalidation * for user mappings. 
*/ - __flush_icache_range(addr, addr + len); + __flush_icache_range(start, end); } } -static void flush_ptrace_access(struct vm_area_struct *vma, struct page *page, - unsigned long uaddr, void *kaddr, - unsigned long len) +static void flush_ptrace_access(struct vm_area_struct *vma, unsigned long start, + unsigned long end) { if (vma->vm_flags & VM_EXEC) - sync_icache_aliases(kaddr, len); + sync_icache_aliases(start, end); } /* @@ -48,7 +45,7 @@ void copy_to_user_page(struct vm_area_struct *vma, struct page *page, unsigned long len) { memcpy(dst, src, len); - flush_ptrace_access(vma, page, uaddr, dst, len); + flush_ptrace_access(vma, (unsigned long)dst, (unsigned long)dst + len); } void __sync_icache_dcache(pte_t pte) @@ -56,7 +53,9 @@ void __sync_icache_dcache(pte_t pte) struct page *page = pte_page(pte); if (!test_bit(PG_dcache_clean, &page->flags)) { - sync_icache_aliases(page_address(page), page_size(page)); + sync_icache_aliases((unsigned long)page_address(page), + (unsigned long)page_address(page) + + page_size(page)); set_bit(PG_dcache_clean, &page->flags); } } -- cgit v1.2.3 From 393239be1ba69dcd29be504ffe14938509795821 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 24 May 2021 09:30:00 +0100 Subject: arm64: Fix cache maintenance function comments Fix and expand comments for the cache maintenance functions in cacheflush.h. Adds comments to functions that weren't described before. Explains what the functions do using Arm Architecture Reference Manual terminology. No functional change intended. Acked-by: Mark Rutland Signed-off-by: Fuad Tabba Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20210524083001.2586635-18-tabba@google.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/cacheflush.h | 47 +++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 70b389a8dea5..26617df1fa45 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -30,31 +30,44 @@ * the implementation assumes non-aliasing VIPT D-cache and (aliasing) * VIPT I-cache. * - * flush_icache_range(start, end) + * All functions below apply to the interval [start, end) + * - start - virtual start address (inclusive) + * - end - virtual end address (exclusive) * - * Ensure coherency between the I-cache and the D-cache in the - * region described by start, end. - * - start - virtual start address - * - end - virtual end address + * __flush_icache_range(start, end) * - * invalidate_icache_range(start, end) - * - * Invalidate the I-cache in the region described by start, end. - * - start - virtual start address - * - end - virtual end address + * Ensure coherency between the I-cache and the D-cache region to + * the Point of Unification. * * __flush_cache_user_range(start, end) * - * Ensure coherency between the I-cache and the D-cache in the - * region described by start, end. - * - start - virtual start address - * - end - virtual end address + * Ensure coherency between the I-cache and the D-cache region to + * the Point of Unification. + * Use only if the region might access user memory. + * + * invalidate_icache_range(start, end) + * + * Invalidate I-cache region to the Point of Unification. * * __flush_dcache_area(start, end) * - * Ensure that the data held in page is written back. - * - start - virtual start address - * - end - virtual end address + * Clean and invalidate D-cache region to the Point of Coherency. 
+ * + * __inval_dcache_area(start, end) + * + * Invalidate D-cache region to the Point of Coherency. + * + * __clean_dcache_area_poc(start, end) + * + * Clean D-cache region to the Point of Coherency. + * + * __clean_dcache_area_pop(start, end) + * + * Clean D-cache region to the Point of Persistence. + * + * __clean_dcache_area_pou(start, end) + * + * Clean D-cache region to the Point of Unification. */ extern void __flush_icache_range(unsigned long start, unsigned long end); extern void invalidate_icache_range(unsigned long start, unsigned long end); -- cgit v1.2.3 From fade9c2c6ee2baea7df8e6059b3f143c681e5ce4 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 24 May 2021 09:30:01 +0100 Subject: arm64: Rename arm64-internal cache maintenance functions Although naming across the codebase isn't that consistent, it tends to follow certain patterns. Moreover, the term "flush" isn't defined in the Arm Architecture reference manual, and might be interpreted to mean clean, invalidate, or both for a cache. Rename arm64-internal functions to make the naming internally consistent, as well as making it consistent with the Arm ARM, by specifying whether it applies to the instruction, data, or both caches, whether the operation is a clean, invalidate, or both. Also specify which point the operation applies to, i.e., to the point of unification (PoU), coherency (PoC), or persistence (PoP). This commit applies the following sed transformation to all files under arch/arm64: "s/\b__flush_cache_range\b/caches_clean_inval_pou_macro/g;"\ "s/\b__flush_icache_range\b/caches_clean_inval_pou/g;"\ "s/\binvalidate_icache_range\b/icache_inval_pou/g;"\ "s/\b__flush_dcache_area\b/dcache_clean_inval_poc/g;"\ "s/\b__inval_dcache_area\b/dcache_inval_poc/g;"\ "s/__clean_dcache_area_poc\b/dcache_clean_poc/g;"\ "s/\b__clean_dcache_area_pop\b/dcache_clean_pop/g;"\ "s/\b__clean_dcache_area_pou\b/dcache_clean_pou/g;"\ "s/\b__flush_cache_user_range\b/caches_clean_inval_user_pou/g;"\ "s/\b__flush_icache_all\b/icache_inval_all_pou/g;" Note that __clean_dcache_area_poc is deliberately missing a word boundary check at the beginning in order to match the efistub symbols in image-vars.h. Also note that, despite its name, __flush_icache_range operates on both instruction and data caches. The name change here reflects that. No functional change intended. 
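As a quick orientation for callers, a hedged before/after sketch of the rename at a hypothetical call site:

    /* Hypothetical caller making newly written kernel text coherent. */
    static void make_text_coherent(void *va, size_t size)
    {
            /*
             * was: __flush_icache_range((unsigned long)va,
             *                           (unsigned long)va + size);
             */
            caches_clean_inval_pou((unsigned long)va,
                                   (unsigned long)va + size);
    }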
Acked-by: Mark Rutland Signed-off-by: Fuad Tabba Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20210524083001.2586635-19-tabba@google.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/arch_gicv3.h | 2 +- arch/arm64/include/asm/cacheflush.h | 36 +++++++++++------------ arch/arm64/include/asm/efi.h | 2 +- arch/arm64/include/asm/kvm_mmu.h | 6 ++-- arch/arm64/kernel/alternative.c | 2 +- arch/arm64/kernel/efi-entry.S | 4 +-- arch/arm64/kernel/head.S | 8 ++--- arch/arm64/kernel/hibernate-asm.S | 4 +-- arch/arm64/kernel/hibernate.c | 12 ++++---- arch/arm64/kernel/idreg-override.c | 2 +- arch/arm64/kernel/image-vars.h | 2 +- arch/arm64/kernel/insn.c | 2 +- arch/arm64/kernel/kaslr.c | 6 ++-- arch/arm64/kernel/machine_kexec.c | 10 +++---- arch/arm64/kernel/smp.c | 4 +-- arch/arm64/kernel/smp_spin_table.c | 4 +-- arch/arm64/kernel/sys_compat.c | 2 +- arch/arm64/kvm/arm.c | 2 +- arch/arm64/kvm/hyp/nvhe/cache.S | 4 +-- arch/arm64/kvm/hyp/nvhe/setup.c | 2 +- arch/arm64/kvm/hyp/nvhe/tlb.c | 2 +- arch/arm64/kvm/hyp/pgtable.c | 4 +-- arch/arm64/lib/uaccess_flushcache.c | 4 +-- arch/arm64/mm/cache.S | 58 ++++++++++++++++++------------------- arch/arm64/mm/flush.c | 12 ++++---- 25 files changed, 98 insertions(+), 98 deletions(-) diff --git a/arch/arm64/include/asm/arch_gicv3.h b/arch/arm64/include/asm/arch_gicv3.h index ed1cc9d8e6df..4ad22c3135db 100644 --- a/arch/arm64/include/asm/arch_gicv3.h +++ b/arch/arm64/include/asm/arch_gicv3.h @@ -125,7 +125,7 @@ static inline u32 gic_read_rpr(void) #define gic_write_lpir(v, c) writeq_relaxed(v, c) #define gic_flush_dcache_to_poc(a,l) \ - __flush_dcache_area((unsigned long)(a), (unsigned long)(a)+(l)) + dcache_clean_inval_poc((unsigned long)(a), (unsigned long)(a)+(l)) #define gits_read_baser(c) readq_relaxed(c) #define gits_write_baser(v, c) writeq_relaxed(v, c) diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 26617df1fa45..543c997eb3b7 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -34,54 +34,54 @@ * - start - virtual start address (inclusive) * - end - virtual end address (exclusive) * - * __flush_icache_range(start, end) + * caches_clean_inval_pou(start, end) * * Ensure coherency between the I-cache and the D-cache region to * the Point of Unification. * - * __flush_cache_user_range(start, end) + * caches_clean_inval_user_pou(start, end) * * Ensure coherency between the I-cache and the D-cache region to * the Point of Unification. * Use only if the region might access user memory. * - * invalidate_icache_range(start, end) + * icache_inval_pou(start, end) * * Invalidate I-cache region to the Point of Unification. * - * __flush_dcache_area(start, end) + * dcache_clean_inval_poc(start, end) * * Clean and invalidate D-cache region to the Point of Coherency. * - * __inval_dcache_area(start, end) + * dcache_inval_poc(start, end) * * Invalidate D-cache region to the Point of Coherency. * - * __clean_dcache_area_poc(start, end) + * dcache_clean_poc(start, end) * * Clean D-cache region to the Point of Coherency. * - * __clean_dcache_area_pop(start, end) + * dcache_clean_pop(start, end) * * Clean D-cache region to the Point of Persistence. * - * __clean_dcache_area_pou(start, end) + * dcache_clean_pou(start, end) * * Clean D-cache region to the Point of Unification. 
*/ -extern void __flush_icache_range(unsigned long start, unsigned long end); -extern void invalidate_icache_range(unsigned long start, unsigned long end); -extern void __flush_dcache_area(unsigned long start, unsigned long end); -extern void __inval_dcache_area(unsigned long start, unsigned long end); -extern void __clean_dcache_area_poc(unsigned long start, unsigned long end); -extern void __clean_dcache_area_pop(unsigned long start, unsigned long end); -extern void __clean_dcache_area_pou(unsigned long start, unsigned long end); -extern long __flush_cache_user_range(unsigned long start, unsigned long end); +extern void caches_clean_inval_pou(unsigned long start, unsigned long end); +extern void icache_inval_pou(unsigned long start, unsigned long end); +extern void dcache_clean_inval_poc(unsigned long start, unsigned long end); +extern void dcache_inval_poc(unsigned long start, unsigned long end); +extern void dcache_clean_poc(unsigned long start, unsigned long end); +extern void dcache_clean_pop(unsigned long start, unsigned long end); +extern void dcache_clean_pou(unsigned long start, unsigned long end); +extern long caches_clean_inval_user_pou(unsigned long start, unsigned long end); extern void sync_icache_aliases(unsigned long start, unsigned long end); static inline void flush_icache_range(unsigned long start, unsigned long end) { - __flush_icache_range(start, end); + caches_clean_inval_pou(start, end); /* * IPI all online CPUs so that they undergo a context synchronization @@ -135,7 +135,7 @@ extern void copy_to_user_page(struct vm_area_struct *, struct page *, #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 extern void flush_dcache_page(struct page *); -static __always_inline void __flush_icache_all(void) +static __always_inline void icache_inval_all_pou(void) { if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC)) return; diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h index 0ae2397076fd..1bed37eb013a 100644 --- a/arch/arm64/include/asm/efi.h +++ b/arch/arm64/include/asm/efi.h @@ -137,7 +137,7 @@ void efi_virtmap_unload(void); static inline void efi_capsule_flush_cache_range(void *addr, int size) { - __flush_dcache_area((unsigned long)addr, (unsigned long)addr + size); + dcache_clean_inval_poc((unsigned long)addr, (unsigned long)addr + size); } #endif /* _ASM_EFI_H */ diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 33293d5855af..f4cbfa9025a8 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -181,7 +181,7 @@ static inline void *__kvm_vector_slot2addr(void *base, struct kvm; #define kvm_flush_dcache_to_poc(a,l) \ - __flush_dcache_area((unsigned long)(a), (unsigned long)(a)+(l)) + dcache_clean_inval_poc((unsigned long)(a), (unsigned long)(a)+(l)) static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu) { @@ -209,12 +209,12 @@ static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn, { if (icache_is_aliasing()) { /* any kind of VIPT cache */ - __flush_icache_all(); + icache_inval_all_pou(); } else if (is_kernel_in_hyp_mode() || !icache_is_vpipt()) { /* PIPT or VPIPT at EL2 (see comment in __kvm_tlb_flush_vmid_ipa) */ void *va = page_address(pfn_to_page(pfn)); - invalidate_icache_range((unsigned long)va, + icache_inval_pou((unsigned long)va, (unsigned long)va + size); } } diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c index c906d20c7b52..3fb79b76e9d9 100644 --- a/arch/arm64/kernel/alternative.c +++ b/arch/arm64/kernel/alternative.c @@ -181,7 +181,7 
@@ static void __nocfi __apply_alternatives(struct alt_region *region, bool is_modu */ if (!is_module) { dsb(ish); - __flush_icache_all(); + icache_inval_all_pou(); isb(); /* Ignore ARM64_CB bit from feature mask */ diff --git a/arch/arm64/kernel/efi-entry.S b/arch/arm64/kernel/efi-entry.S index b0f728fb61f0..61a87fa1c305 100644 --- a/arch/arm64/kernel/efi-entry.S +++ b/arch/arm64/kernel/efi-entry.S @@ -29,7 +29,7 @@ SYM_CODE_START(efi_enter_kernel) */ ldr w1, =kernel_size add x1, x0, x1 - bl __clean_dcache_area_poc + bl dcache_clean_poc ic ialluis /* @@ -38,7 +38,7 @@ SYM_CODE_START(efi_enter_kernel) */ adr x0, 0f adr x1, 3f - bl __clean_dcache_area_poc + bl dcache_clean_poc 0: /* Turn off Dcache and MMU */ mrs x0, CurrentEL diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 8df0ac8d9123..6928cb67d3a0 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -118,7 +118,7 @@ SYM_CODE_START_LOCAL(preserve_boot_args) // MMU off add x1, x0, #0x20 // 4 x 8 bytes - b __inval_dcache_area // tail call + b dcache_inval_poc // tail call SYM_CODE_END(preserve_boot_args) /* @@ -268,7 +268,7 @@ SYM_FUNC_START_LOCAL(__create_page_tables) */ adrp x0, init_pg_dir adrp x1, init_pg_end - bl __inval_dcache_area + bl dcache_inval_poc /* * Clear the init page tables. @@ -381,11 +381,11 @@ SYM_FUNC_START_LOCAL(__create_page_tables) adrp x0, idmap_pg_dir adrp x1, idmap_pg_end - bl __inval_dcache_area + bl dcache_inval_poc adrp x0, init_pg_dir adrp x1, init_pg_end - bl __inval_dcache_area + bl dcache_inval_poc ret x28 SYM_FUNC_END(__create_page_tables) diff --git a/arch/arm64/kernel/hibernate-asm.S b/arch/arm64/kernel/hibernate-asm.S index ef2ab7caf815..81c0186a5e32 100644 --- a/arch/arm64/kernel/hibernate-asm.S +++ b/arch/arm64/kernel/hibernate-asm.S @@ -45,7 +45,7 @@ * Because this code has to be copied to a 'safe' page, it can't call out to * other functions by PC-relative address. Also remember that it may be * mid-way through over-writing other functions. For this reason it contains - * code from __flush_icache_range() and uses the copy_page() macro. + * code from caches_clean_inval_pou() and uses the copy_page() macro. * * This 'safe' page is mapped via ttbr0, and executed from there. 
This function * switches to a copy of the linear map in ttbr1, performs the restore, then @@ -87,7 +87,7 @@ SYM_CODE_START(swsusp_arch_suspend_exit) copy_page x0, x1, x2, x3, x4, x5, x6, x7, x8, x9 add x1, x10, #PAGE_SIZE - /* Clean the copied page to PoU - based on __flush_icache_range() */ + /* Clean the copied page to PoU - based on caches_clean_inval_pou() */ raw_dcache_line_size x2, x3 sub x3, x2, #1 bic x4, x10, x3 diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index b40ddce71507..46a0b4d6e251 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -210,7 +210,7 @@ static int create_safe_exec_page(void *src_start, size_t length, return -ENOMEM; memcpy(page, src_start, length); - __flush_icache_range((unsigned long)page, (unsigned long)page + length); + caches_clean_inval_pou((unsigned long)page, (unsigned long)page + length); rc = trans_pgd_idmap_page(&trans_info, &trans_ttbr0, &t0sz, page); if (rc) return rc; @@ -381,17 +381,17 @@ int swsusp_arch_suspend(void) ret = swsusp_save(); } else { /* Clean kernel core startup/idle code to PoC*/ - __flush_dcache_area((unsigned long)__mmuoff_data_start, + dcache_clean_inval_poc((unsigned long)__mmuoff_data_start, (unsigned long)__mmuoff_data_end); - __flush_dcache_area((unsigned long)__idmap_text_start, + dcache_clean_inval_poc((unsigned long)__idmap_text_start, (unsigned long)__idmap_text_end); /* Clean kvm setup code to PoC? */ if (el2_reset_needed()) { - __flush_dcache_area( + dcache_clean_inval_poc( (unsigned long)__hyp_idmap_text_start, (unsigned long)__hyp_idmap_text_end); - __flush_dcache_area((unsigned long)__hyp_text_start, + dcache_clean_inval_poc((unsigned long)__hyp_text_start, (unsigned long)__hyp_text_end); } @@ -477,7 +477,7 @@ int swsusp_arch_resume(void) * The hibernate exit text contains a set of el2 vectors, that will * be executed at el2 with the mmu off in order to reload hyp-stub. 
*/ - __flush_dcache_area((unsigned long)hibernate_exit, + dcache_clean_inval_poc((unsigned long)hibernate_exit, (unsigned long)hibernate_exit + exit_size); /* diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/idreg-override.c index 3dd515baf526..53a381a7f65d 100644 --- a/arch/arm64/kernel/idreg-override.c +++ b/arch/arm64/kernel/idreg-override.c @@ -237,7 +237,7 @@ asmlinkage void __init init_feature_override(void) for (i = 0; i < ARRAY_SIZE(regs); i++) { if (regs[i]->override) - __flush_dcache_area((unsigned long)regs[i]->override, + dcache_clean_inval_poc((unsigned long)regs[i]->override, (unsigned long)regs[i]->override + sizeof(*regs[i]->override)); } diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index bcf3c2755370..c96a9a0043bf 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -35,7 +35,7 @@ __efistub_strnlen = __pi_strnlen; __efistub_strcmp = __pi_strcmp; __efistub_strncmp = __pi_strncmp; __efistub_strrchr = __pi_strrchr; -__efistub___clean_dcache_area_poc = __pi___clean_dcache_area_poc; +__efistub_dcache_clean_poc = __pi_dcache_clean_poc; #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) __efistub___memcpy = __pi_memcpy; diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c index 6c0de2f60ea9..51cb8dc98d00 100644 --- a/arch/arm64/kernel/insn.c +++ b/arch/arm64/kernel/insn.c @@ -198,7 +198,7 @@ int __kprobes aarch64_insn_patch_text_nosync(void *addr, u32 insn) ret = aarch64_insn_write(tp, insn); if (ret == 0) - __flush_icache_range((uintptr_t)tp, + caches_clean_inval_pou((uintptr_t)tp, (uintptr_t)tp + AARCH64_INSN_SIZE); return ret; diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index 49cccd03cb37..cfa2cfde3019 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -72,7 +72,7 @@ u64 __init kaslr_early_init(void) * we end up running with module randomization disabled. */ module_alloc_base = (u64)_etext - MODULES_VSIZE; - __flush_dcache_area((unsigned long)&module_alloc_base, + dcache_clean_inval_poc((unsigned long)&module_alloc_base, (unsigned long)&module_alloc_base + sizeof(module_alloc_base)); @@ -172,10 +172,10 @@ u64 __init kaslr_early_init(void) module_alloc_base += (module_range * (seed & ((1 << 21) - 1))) >> 21; module_alloc_base &= PAGE_MASK; - __flush_dcache_area((unsigned long)&module_alloc_base, + dcache_clean_inval_poc((unsigned long)&module_alloc_base, (unsigned long)&module_alloc_base + sizeof(module_alloc_base)); - __flush_dcache_area((unsigned long)&memstart_offset_seed, + dcache_clean_inval_poc((unsigned long)&memstart_offset_seed, (unsigned long)&memstart_offset_seed + sizeof(memstart_offset_seed)); diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c index 3e79110c8f3a..03ceabe4d912 100644 --- a/arch/arm64/kernel/machine_kexec.c +++ b/arch/arm64/kernel/machine_kexec.c @@ -72,10 +72,10 @@ int machine_kexec_post_load(struct kimage *kimage) * For execution with the MMU off, reloc_code needs to be cleaned to the * PoC and invalidated from the I-cache. */ - __flush_dcache_area((unsigned long)reloc_code, + dcache_clean_inval_poc((unsigned long)reloc_code, (unsigned long)reloc_code + arm64_relocate_new_kernel_size); - invalidate_icache_range((uintptr_t)reloc_code, + icache_inval_pou((uintptr_t)reloc_code, (uintptr_t)reloc_code + arm64_relocate_new_kernel_size); @@ -111,7 +111,7 @@ static void kexec_list_flush(struct kimage *kimage) unsigned long addr; /* flush the list entries. 
*/ - __flush_dcache_area((unsigned long)entry, + dcache_clean_inval_poc((unsigned long)entry, (unsigned long)entry + sizeof(kimage_entry_t)); @@ -128,7 +128,7 @@ static void kexec_list_flush(struct kimage *kimage) break; case IND_SOURCE: /* flush the source pages. */ - __flush_dcache_area(addr, addr + PAGE_SIZE); + dcache_clean_inval_poc(addr, addr + PAGE_SIZE); break; case IND_DESTINATION: break; @@ -155,7 +155,7 @@ static void kexec_segment_flush(const struct kimage *kimage) kimage->segment[i].memsz, kimage->segment[i].memsz / PAGE_SIZE); - __flush_dcache_area( + dcache_clean_inval_poc( (unsigned long)phys_to_virt(kimage->segment[i].mem), (unsigned long)phys_to_virt(kimage->segment[i].mem) + kimage->segment[i].memsz); diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 5fcdee331087..9b4c1118194d 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -122,7 +122,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) secondary_data.task = idle; secondary_data.stack = task_stack_page(idle) + THREAD_SIZE; update_cpu_boot_status(CPU_MMU_OFF); - __flush_dcache_area((unsigned long)&secondary_data, + dcache_clean_inval_poc((unsigned long)&secondary_data, (unsigned long)&secondary_data + sizeof(secondary_data)); @@ -145,7 +145,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) pr_crit("CPU%u: failed to come online\n", cpu); secondary_data.task = NULL; secondary_data.stack = NULL; - __flush_dcache_area((unsigned long)&secondary_data, + dcache_clean_inval_poc((unsigned long)&secondary_data, (unsigned long)&secondary_data + sizeof(secondary_data)); status = READ_ONCE(secondary_data.status); diff --git a/arch/arm64/kernel/smp_spin_table.c b/arch/arm64/kernel/smp_spin_table.c index 58d804582a35..7e1624ecab3c 100644 --- a/arch/arm64/kernel/smp_spin_table.c +++ b/arch/arm64/kernel/smp_spin_table.c @@ -36,7 +36,7 @@ static void write_pen_release(u64 val) unsigned long size = sizeof(secondary_holding_pen_release); secondary_holding_pen_release = val; - __flush_dcache_area((unsigned long)start, (unsigned long)start + size); + dcache_clean_inval_poc((unsigned long)start, (unsigned long)start + size); } @@ -90,7 +90,7 @@ static int smp_spin_table_cpu_prepare(unsigned int cpu) * the boot protocol. 
*/ writeq_relaxed(pa_holding_pen, release_addr); - __flush_dcache_area((__force unsigned long)release_addr, + dcache_clean_inval_poc((__force unsigned long)release_addr, (__force unsigned long)release_addr + sizeof(*release_addr)); diff --git a/arch/arm64/kernel/sys_compat.c b/arch/arm64/kernel/sys_compat.c index 265fe3eb1069..db5159a3055f 100644 --- a/arch/arm64/kernel/sys_compat.c +++ b/arch/arm64/kernel/sys_compat.c @@ -41,7 +41,7 @@ __do_compat_cache_op(unsigned long start, unsigned long end) dsb(ish); } - ret = __flush_cache_user_range(start, start + chunk); + ret = caches_clean_inval_user_pou(start, start + chunk); if (ret) return ret; diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 1cb39c0803a4..c1953f65ca0e 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1064,7 +1064,7 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu, if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) stage2_unmap_vm(vcpu->kvm); else - __flush_icache_all(); + icache_inval_all_pou(); } vcpu_reset_hcr(vcpu); diff --git a/arch/arm64/kvm/hyp/nvhe/cache.S b/arch/arm64/kvm/hyp/nvhe/cache.S index 36cef6915428..958734f4d6b0 100644 --- a/arch/arm64/kvm/hyp/nvhe/cache.S +++ b/arch/arm64/kvm/hyp/nvhe/cache.S @@ -7,7 +7,7 @@ #include #include -SYM_FUNC_START_PI(__flush_dcache_area) +SYM_FUNC_START_PI(dcache_clean_inval_poc) dcache_by_line_op civac, sy, x0, x1, x2, x3 ret -SYM_FUNC_END_PI(__flush_dcache_area) +SYM_FUNC_END_PI(dcache_clean_inval_poc) diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 5dffe928f256..8143ebd4fb72 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -134,7 +134,7 @@ static void update_nvhe_init_params(void) for (i = 0; i < hyp_nr_cpus; i++) { params = per_cpu_ptr(&kvm_init_params, i); params->pgd_pa = __hyp_pa(pkvm_pgtable.pgd); - __flush_dcache_area((unsigned long)params, + dcache_clean_inval_poc((unsigned long)params, (unsigned long)params + sizeof(*params)); } } diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c index 83dc3b271bc5..38ed0f6f2703 100644 --- a/arch/arm64/kvm/hyp/nvhe/tlb.c +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c @@ -104,7 +104,7 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, * you should be running with VHE enabled. 
*/ if (icache_is_vpipt()) - __flush_icache_all(); + icache_inval_all_pou(); __tlb_switch_to_host(&cxt); } diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 10d2f04013d4..e9ad7fb28ee3 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -841,7 +841,7 @@ static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, if (need_flush) { kvm_pte_t *pte_follow = kvm_pte_follow(pte, mm_ops); - __flush_dcache_area((unsigned long)pte_follow, + dcache_clean_inval_poc((unsigned long)pte_follow, (unsigned long)pte_follow + kvm_granule_size(level)); } @@ -997,7 +997,7 @@ static int stage2_flush_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, return 0; pte_follow = kvm_pte_follow(pte, mm_ops); - __flush_dcache_area((unsigned long)pte_follow, + dcache_clean_inval_poc((unsigned long)pte_follow, (unsigned long)pte_follow + kvm_granule_size(level)); return 0; diff --git a/arch/arm64/lib/uaccess_flushcache.c b/arch/arm64/lib/uaccess_flushcache.c index 62ea989effe8..baee22961bdb 100644 --- a/arch/arm64/lib/uaccess_flushcache.c +++ b/arch/arm64/lib/uaccess_flushcache.c @@ -15,7 +15,7 @@ void memcpy_flushcache(void *dst, const void *src, size_t cnt) * barrier to order the cache maintenance against the memcpy. */ memcpy(dst, src, cnt); - __clean_dcache_area_pop((unsigned long)dst, (unsigned long)dst + cnt); + dcache_clean_pop((unsigned long)dst, (unsigned long)dst + cnt); } EXPORT_SYMBOL_GPL(memcpy_flushcache); @@ -33,6 +33,6 @@ unsigned long __copy_user_flushcache(void *to, const void __user *from, rc = raw_copy_from_user(to, from, n); /* See above */ - __clean_dcache_area_pop((unsigned long)to, (unsigned long)to + n - rc); + dcache_clean_pop((unsigned long)to, (unsigned long)to + n - rc); return rc; } diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index ea605d94182f..5051b3c1a4f1 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -15,7 +15,7 @@ #include /* - * __flush_cache_range(start,end) [fixup] + * caches_clean_inval_pou_macro(start,end) [fixup] * * Ensure that the I and D caches are coherent within specified region. * This is typically used when code has been written to a memory region, @@ -25,7 +25,7 @@ * - end - virtual end address of region * - fixup - optional label to branch to on user fault */ -.macro __flush_cache_range, fixup +.macro caches_clean_inval_pou_macro, fixup alternative_if ARM64_HAS_CACHE_IDC dsb ishst b .Ldc_skip_\@ @@ -43,7 +43,7 @@ alternative_else_nop_endif .endm /* - * __flush_icache_range(start,end) + * caches_clean_inval_pou(start,end) * * Ensure that the I and D caches are coherent within specified region. * This is typically used when code has been written to a memory region, @@ -52,13 +52,13 @@ alternative_else_nop_endif * - start - virtual start address of region * - end - virtual end address of region */ -SYM_FUNC_START(__flush_icache_range) - __flush_cache_range +SYM_FUNC_START(caches_clean_inval_pou) + caches_clean_inval_pou_macro ret -SYM_FUNC_END(__flush_icache_range) +SYM_FUNC_END(caches_clean_inval_pou) /* - * __flush_cache_user_range(start,end) + * caches_clean_inval_user_pou(start,end) * * Ensure that the I and D caches are coherent within specified region. 
* This is typically used when code has been written to a memory region, @@ -67,10 +67,10 @@ SYM_FUNC_END(__flush_icache_range) * - start - virtual start address of region * - end - virtual end address of region */ -SYM_FUNC_START(__flush_cache_user_range) +SYM_FUNC_START(caches_clean_inval_user_pou) uaccess_ttbr0_enable x2, x3, x4 - __flush_cache_range 2f + caches_clean_inval_pou_macro 2f mov x0, xzr 1: uaccess_ttbr0_disable x1, x2 @@ -78,17 +78,17 @@ SYM_FUNC_START(__flush_cache_user_range) 2: mov x0, #-EFAULT b 1b -SYM_FUNC_END(__flush_cache_user_range) +SYM_FUNC_END(caches_clean_inval_user_pou) /* - * invalidate_icache_range(start,end) + * icache_inval_pou(start,end) * * Ensure that the I cache is invalid within specified region. * * - start - virtual start address of region * - end - virtual end address of region */ -SYM_FUNC_START(invalidate_icache_range) +SYM_FUNC_START(icache_inval_pou) alternative_if ARM64_HAS_CACHE_DIC isb ret @@ -96,10 +96,10 @@ alternative_else_nop_endif invalidate_icache_by_line x0, x1, x2, x3 ret -SYM_FUNC_END(invalidate_icache_range) +SYM_FUNC_END(icache_inval_pou) /* - * __flush_dcache_area(start, end) + * dcache_clean_inval_poc(start, end) * * Ensure that any D-cache lines for the interval [start, end) * are cleaned and invalidated to the PoC. @@ -107,13 +107,13 @@ SYM_FUNC_END(invalidate_icache_range) * - start - virtual start address of region * - end - virtual end address of region */ -SYM_FUNC_START_PI(__flush_dcache_area) +SYM_FUNC_START_PI(dcache_clean_inval_poc) dcache_by_line_op civac, sy, x0, x1, x2, x3 ret -SYM_FUNC_END_PI(__flush_dcache_area) +SYM_FUNC_END_PI(dcache_clean_inval_poc) /* - * __clean_dcache_area_pou(start, end) + * dcache_clean_pou(start, end) * * Ensure that any D-cache lines for the interval [start, end) * are cleaned to the PoU. @@ -121,17 +121,17 @@ SYM_FUNC_END_PI(__flush_dcache_area) * - start - virtual start address of region * - end - virtual end address of region */ -SYM_FUNC_START(__clean_dcache_area_pou) +SYM_FUNC_START(dcache_clean_pou) alternative_if ARM64_HAS_CACHE_IDC dsb ishst ret alternative_else_nop_endif dcache_by_line_op cvau, ish, x0, x1, x2, x3 ret -SYM_FUNC_END(__clean_dcache_area_pou) +SYM_FUNC_END(dcache_clean_pou) /* - * __inval_dcache_area(start, end) + * dcache_inval_poc(start, end) * * Ensure that any D-cache lines for the interval [start, end) * are invalidated. Any partial lines at the ends of the interval are @@ -141,7 +141,7 @@ SYM_FUNC_END(__clean_dcache_area_pou) * - end - kernel end address of region */ SYM_FUNC_START_LOCAL(__dma_inv_area) -SYM_FUNC_START_PI(__inval_dcache_area) +SYM_FUNC_START_PI(dcache_inval_poc) /* FALLTHROUGH */ /* @@ -166,11 +166,11 @@ SYM_FUNC_START_PI(__inval_dcache_area) b.lo 2b dsb sy ret -SYM_FUNC_END_PI(__inval_dcache_area) +SYM_FUNC_END_PI(dcache_inval_poc) SYM_FUNC_END(__dma_inv_area) /* - * __clean_dcache_area_poc(start, end) + * dcache_clean_poc(start, end) * * Ensure that any D-cache lines for the interval [start, end) * are cleaned to the PoC. 
@@ -179,7 +179,7 @@ SYM_FUNC_END(__dma_inv_area) * - end - virtual end address of region */ SYM_FUNC_START_LOCAL(__dma_clean_area) -SYM_FUNC_START_PI(__clean_dcache_area_poc) +SYM_FUNC_START_PI(dcache_clean_poc) /* FALLTHROUGH */ /* @@ -189,11 +189,11 @@ SYM_FUNC_START_PI(__clean_dcache_area_poc) */ dcache_by_line_op cvac, sy, x0, x1, x2, x3 ret -SYM_FUNC_END_PI(__clean_dcache_area_poc) +SYM_FUNC_END_PI(dcache_clean_poc) SYM_FUNC_END(__dma_clean_area) /* - * __clean_dcache_area_pop(start, end) + * dcache_clean_pop(start, end) * * Ensure that any D-cache lines for the interval [start, end) * are cleaned to the PoP. @@ -201,13 +201,13 @@ SYM_FUNC_END(__dma_clean_area) * - start - virtual start address of region * - end - virtual end address of region */ -SYM_FUNC_START_PI(__clean_dcache_area_pop) +SYM_FUNC_START_PI(dcache_clean_pop) alternative_if_not ARM64_HAS_DCPOP - b __clean_dcache_area_poc + b dcache_clean_poc alternative_else_nop_endif dcache_by_line_op cvap, sy, x0, x1, x2, x3 ret -SYM_FUNC_END_PI(__clean_dcache_area_pop) +SYM_FUNC_END_PI(dcache_clean_pop) /* * __dma_flush_area(start, size) diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index c4ca7e05fdb8..2aaf950b906c 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -17,14 +17,14 @@ void sync_icache_aliases(unsigned long start, unsigned long end) { if (icache_is_aliasing()) { - __clean_dcache_area_pou(start, end); - __flush_icache_all(); + dcache_clean_pou(start, end); + icache_inval_all_pou(); } else { /* * Don't issue kick_all_cpus_sync() after I-cache invalidation * for user mappings. */ - __flush_icache_range(start, end); + caches_clean_inval_pou(start, end); } } @@ -76,20 +76,20 @@ EXPORT_SYMBOL(flush_dcache_page); /* * Additional functions defined in assembly. */ -EXPORT_SYMBOL(__flush_icache_range); +EXPORT_SYMBOL(caches_clean_inval_pou); #ifdef CONFIG_ARCH_HAS_PMEM_API void arch_wb_cache_pmem(void *addr, size_t size) { /* Ensure order against any prior non-cacheable writes */ dmb(osh); - __clean_dcache_area_pop((unsigned long)addr, (unsigned long)addr + size); + dcache_clean_pop((unsigned long)addr, (unsigned long)addr + size); } EXPORT_SYMBOL_GPL(arch_wb_cache_pmem); void arch_invalidate_pmem(void *addr, size_t size) { - __inval_dcache_area((unsigned long)addr, (unsigned long)addr + size); + dcache_inval_poc((unsigned long)addr, (unsigned long)addr + size); } EXPORT_SYMBOL_GPL(arch_invalidate_pmem); #endif -- cgit v1.2.3 From 3fdc0cb59d97f87e2cc708d424f1538e31744286 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Tue, 18 May 2021 17:36:18 +0100 Subject: arm64: smccc: Add support for SMCCCv1.2 extended input/output registers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SMCCC v1.2 allows x8-x17 to be used as parameter registers and x4-x17 to be used as result registers in SMC64/HVC64. The Arm Firmware Framework for Armv8-A specification makes use of x0-x7 as parameter and result registers. There are other users, such as Hyper-V, that intend to use registers beyond x0-x7 as well. The current SMCCC interface in the kernel uses only x0-x7 as parameter registers and x0-x3 as result registers, as required by SMCCC v1.0. Add a new interface to support this extended set of input/output registers, namely x0-x17 as both parameter and result registers.
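As a usage sketch of the new interface (the function ID and argument values here are hypothetical; the structure and prototypes are the ones added by this patch):

    #include <linux/arm-smccc.h>

    static void example_fw_call(void)
    {
            struct arm_smccc_1_2_regs args = {
                    .a0 = 0xc4000042,       /* hypothetical SMC64 function ID */
                    .a1 = 1,
                    .a8 = 8,                /* x8-x17 usable as parameters with v1.2 */
            };
            struct arm_smccc_1_2_regs res;

            arm_smccc_1_2_smc(&args, &res);
            /* res.a0 .. res.a17 now hold the values returned in x0-x17 */
    }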
Acked-by: Mark Rutland Tested-by: Michael Kelley Reviewed-by: Michael Kelley Cc: Will Deacon Cc: Catalin Marinas Signed-off-by: Sudeep Holla Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20210518163618.43950-1-sudeep.holla@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/asm-offsets.c | 9 +++++++ arch/arm64/kernel/smccc-call.S | 57 +++++++++++++++++++++++++++++++++++++++++ include/linux/arm-smccc.h | 55 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 121 insertions(+) diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 0cb34ccb6e73..74321bc9a459 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -138,6 +138,15 @@ int main(void) DEFINE(ARM_SMCCC_RES_X2_OFFS, offsetof(struct arm_smccc_res, a2)); DEFINE(ARM_SMCCC_QUIRK_ID_OFFS, offsetof(struct arm_smccc_quirk, id)); DEFINE(ARM_SMCCC_QUIRK_STATE_OFFS, offsetof(struct arm_smccc_quirk, state)); + DEFINE(ARM_SMCCC_1_2_REGS_X0_OFFS, offsetof(struct arm_smccc_1_2_regs, a0)); + DEFINE(ARM_SMCCC_1_2_REGS_X2_OFFS, offsetof(struct arm_smccc_1_2_regs, a2)); + DEFINE(ARM_SMCCC_1_2_REGS_X4_OFFS, offsetof(struct arm_smccc_1_2_regs, a4)); + DEFINE(ARM_SMCCC_1_2_REGS_X6_OFFS, offsetof(struct arm_smccc_1_2_regs, a6)); + DEFINE(ARM_SMCCC_1_2_REGS_X8_OFFS, offsetof(struct arm_smccc_1_2_regs, a8)); + DEFINE(ARM_SMCCC_1_2_REGS_X10_OFFS, offsetof(struct arm_smccc_1_2_regs, a10)); + DEFINE(ARM_SMCCC_1_2_REGS_X12_OFFS, offsetof(struct arm_smccc_1_2_regs, a12)); + DEFINE(ARM_SMCCC_1_2_REGS_X14_OFFS, offsetof(struct arm_smccc_1_2_regs, a14)); + DEFINE(ARM_SMCCC_1_2_REGS_X16_OFFS, offsetof(struct arm_smccc_1_2_regs, a16)); BLANK(); DEFINE(HIBERN_PBE_ORIG, offsetof(struct pbe, orig_address)); DEFINE(HIBERN_PBE_ADDR, offsetof(struct pbe, address)); diff --git a/arch/arm64/kernel/smccc-call.S b/arch/arm64/kernel/smccc-call.S index d62447964ed9..2def9d0dd3dd 100644 --- a/arch/arm64/kernel/smccc-call.S +++ b/arch/arm64/kernel/smccc-call.S @@ -43,3 +43,60 @@ SYM_FUNC_START(__arm_smccc_hvc) SMCCC hvc SYM_FUNC_END(__arm_smccc_hvc) EXPORT_SYMBOL(__arm_smccc_hvc) + + .macro SMCCC_1_2 instr + /* Save `res` and free a GPR that won't be clobbered */ + stp x1, x19, [sp, #-16]! 
+ + /* Ensure `args` won't be clobbered while loading regs in next step */ + mov x19, x0 + + /* Load the registers x0 - x17 from the struct arm_smccc_1_2_regs */ + ldp x0, x1, [x19, #ARM_SMCCC_1_2_REGS_X0_OFFS] + ldp x2, x3, [x19, #ARM_SMCCC_1_2_REGS_X2_OFFS] + ldp x4, x5, [x19, #ARM_SMCCC_1_2_REGS_X4_OFFS] + ldp x6, x7, [x19, #ARM_SMCCC_1_2_REGS_X6_OFFS] + ldp x8, x9, [x19, #ARM_SMCCC_1_2_REGS_X8_OFFS] + ldp x10, x11, [x19, #ARM_SMCCC_1_2_REGS_X10_OFFS] + ldp x12, x13, [x19, #ARM_SMCCC_1_2_REGS_X12_OFFS] + ldp x14, x15, [x19, #ARM_SMCCC_1_2_REGS_X14_OFFS] + ldp x16, x17, [x19, #ARM_SMCCC_1_2_REGS_X16_OFFS] + + \instr #0 + + /* Load the `res` from the stack */ + ldr x19, [sp] + + /* Store the registers x0 - x17 into the result structure */ + stp x0, x1, [x19, #ARM_SMCCC_1_2_REGS_X0_OFFS] + stp x2, x3, [x19, #ARM_SMCCC_1_2_REGS_X2_OFFS] + stp x4, x5, [x19, #ARM_SMCCC_1_2_REGS_X4_OFFS] + stp x6, x7, [x19, #ARM_SMCCC_1_2_REGS_X6_OFFS] + stp x8, x9, [x19, #ARM_SMCCC_1_2_REGS_X8_OFFS] + stp x10, x11, [x19, #ARM_SMCCC_1_2_REGS_X10_OFFS] + stp x12, x13, [x19, #ARM_SMCCC_1_2_REGS_X12_OFFS] + stp x14, x15, [x19, #ARM_SMCCC_1_2_REGS_X14_OFFS] + stp x16, x17, [x19, #ARM_SMCCC_1_2_REGS_X16_OFFS] + + /* Restore original x19 */ + ldp xzr, x19, [sp], #16 + ret +.endm + +/* + * void arm_smccc_1_2_hvc(const struct arm_smccc_1_2_regs *args, + * struct arm_smccc_1_2_regs *res); + */ +SYM_FUNC_START(arm_smccc_1_2_hvc) + SMCCC_1_2 hvc +SYM_FUNC_END(arm_smccc_1_2_hvc) +EXPORT_SYMBOL(arm_smccc_1_2_hvc) + +/* + * void arm_smccc_1_2_smc(const struct arm_smccc_1_2_regs *args, + * struct arm_smccc_1_2_regs *res); + */ +SYM_FUNC_START(arm_smccc_1_2_smc) + SMCCC_1_2 smc +SYM_FUNC_END(arm_smccc_1_2_smc) +EXPORT_SYMBOL(arm_smccc_1_2_smc) diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 6861489a1890..5cef2b8b0479 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -227,6 +227,61 @@ struct arm_smccc_res { unsigned long a3; }; +#ifdef CONFIG_ARM64 +/** + * struct arm_smccc_1_2_regs - Arguments for or Results from SMC/HVC call + * @a0-a17 argument values from registers 0 to 17 + */ +struct arm_smccc_1_2_regs { + unsigned long a0; + unsigned long a1; + unsigned long a2; + unsigned long a3; + unsigned long a4; + unsigned long a5; + unsigned long a6; + unsigned long a7; + unsigned long a8; + unsigned long a9; + unsigned long a10; + unsigned long a11; + unsigned long a12; + unsigned long a13; + unsigned long a14; + unsigned long a15; + unsigned long a16; + unsigned long a17; +}; + +/** + * arm_smccc_1_2_hvc() - make HVC calls + * @args: arguments passed via struct arm_smccc_1_2_regs + * @res: result values via struct arm_smccc_1_2_regs + * + * This function is used to make HVC calls following SMC Calling Convention + * v1.2 or above. The content of the supplied param are copied from the + * structure to registers prior to the HVC instruction. The return values + * are updated with the content from registers on return from the HVC + * instruction. + */ +asmlinkage void arm_smccc_1_2_hvc(const struct arm_smccc_1_2_regs *args, + struct arm_smccc_1_2_regs *res); + +/** + * arm_smccc_1_2_smc() - make SMC calls + * @args: arguments passed via struct arm_smccc_1_2_regs + * @res: result values via struct arm_smccc_1_2_regs + * + * This function is used to make SMC calls following SMC Calling Convention + * v1.2 or above. The content of the supplied param are copied from the + * structure to registers prior to the SMC instruction. 
The return values + * are updated with the content from registers on return from the SMC + * instruction. + */ +asmlinkage void arm_smccc_1_2_smc(const struct arm_smccc_1_2_regs *args, + struct arm_smccc_1_2_regs *res); +#endif + /** * struct arm_smccc_quirk - Contains quirk information * @id: quirk identification -- cgit v1.2.3 From 76734d26b54192a31440039459eef2612da63ed4 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Wed, 26 May 2021 10:49:25 -0700 Subject: arm64: Change the on_*stack functions to take a size argument unwind_frame() was previously implicitly checking that the frame record is in bounds of the stack by enforcing that FP is both aligned to 16 and in bounds of the stack. Once the FP alignment requirement is relaxed to 8 this will not be sufficient because it does not account for the case where FP points to 8 bytes before the end of the stack. Make the check explicit by changing the on_*stack functions to take a size argument and adjusting the callers to pass the appropriate sizes. Signed-off-by: Peter Collingbourne Link: https://linux-review.googlesource.com/id/Ib7a3eb3eea41b0687ffaba045ceb2012d077d8b4 Reviewed-by: Mark Rutland Tested-by: Mark Rutland Link: https://lore.kernel.org/r/20210526174927.2477847-1-pcc@google.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/processor.h | 12 ++++++------ arch/arm64/include/asm/sdei.h | 7 ++++--- arch/arm64/include/asm/stacktrace.h | 32 ++++++++++++++++---------------- arch/arm64/kernel/ptrace.c | 2 +- arch/arm64/kernel/sdei.c | 16 +++++++++------- arch/arm64/kernel/stacktrace.c | 2 +- 6 files changed, 37 insertions(+), 34 deletions(-) diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 9df3feeee890..7a094aafec20 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -329,13 +329,13 @@ long get_tagged_addr_ctrl(struct task_struct *task); * of header definitions for the use of task_stack_page. 
*/ -#define current_top_of_stack() \ -({ \ - struct stack_info _info; \ - BUG_ON(!on_accessible_stack(current, current_stack_pointer, &_info)); \ - _info.high; \ +#define current_top_of_stack() \ +({ \ + struct stack_info _info; \ + BUG_ON(!on_accessible_stack(current, current_stack_pointer, 1, &_info)); \ + _info.high; \ }) -#define on_thread_stack() (on_task_stack(current, current_stack_pointer, NULL)) +#define on_thread_stack() (on_task_stack(current, current_stack_pointer, 1, NULL)) #endif /* __ASSEMBLY__ */ #endif /* __ASM_PROCESSOR_H */ diff --git a/arch/arm64/include/asm/sdei.h b/arch/arm64/include/asm/sdei.h index 63e0b92a5fbb..8bc30a5c4569 100644 --- a/arch/arm64/include/asm/sdei.h +++ b/arch/arm64/include/asm/sdei.h @@ -42,8 +42,9 @@ unsigned long sdei_arch_get_entry_point(int conduit); struct stack_info; -bool _on_sdei_stack(unsigned long sp, struct stack_info *info); -static inline bool on_sdei_stack(unsigned long sp, +bool _on_sdei_stack(unsigned long sp, unsigned long size, + struct stack_info *info); +static inline bool on_sdei_stack(unsigned long sp, unsigned long size, struct stack_info *info) { if (!IS_ENABLED(CONFIG_VMAP_STACK)) @@ -51,7 +52,7 @@ static inline bool on_sdei_stack(unsigned long sp, if (!IS_ENABLED(CONFIG_ARM_SDE_INTERFACE)) return false; if (in_nmi()) - return _on_sdei_stack(sp, info); + return _on_sdei_stack(sp, size, info); return false; } diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h index 4b33ca620679..1801399204d7 100644 --- a/arch/arm64/include/asm/stacktrace.h +++ b/arch/arm64/include/asm/stacktrace.h @@ -69,14 +69,14 @@ extern void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, DECLARE_PER_CPU(unsigned long *, irq_stack_ptr); -static inline bool on_stack(unsigned long sp, unsigned long low, - unsigned long high, enum stack_type type, - struct stack_info *info) +static inline bool on_stack(unsigned long sp, unsigned long size, + unsigned long low, unsigned long high, + enum stack_type type, struct stack_info *info) { if (!low) return false; - if (sp < low || sp >= high) + if (sp < low || sp + size < sp || sp + size > high) return false; if (info) { @@ -87,38 +87,38 @@ static inline bool on_stack(unsigned long sp, unsigned long low, return true; } -static inline bool on_irq_stack(unsigned long sp, +static inline bool on_irq_stack(unsigned long sp, unsigned long size, struct stack_info *info) { unsigned long low = (unsigned long)raw_cpu_read(irq_stack_ptr); unsigned long high = low + IRQ_STACK_SIZE; - return on_stack(sp, low, high, STACK_TYPE_IRQ, info); + return on_stack(sp, size, low, high, STACK_TYPE_IRQ, info); } static inline bool on_task_stack(const struct task_struct *tsk, - unsigned long sp, + unsigned long sp, unsigned long size, struct stack_info *info) { unsigned long low = (unsigned long)task_stack_page(tsk); unsigned long high = low + THREAD_SIZE; - return on_stack(sp, low, high, STACK_TYPE_TASK, info); + return on_stack(sp, size, low, high, STACK_TYPE_TASK, info); } #ifdef CONFIG_VMAP_STACK DECLARE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack); -static inline bool on_overflow_stack(unsigned long sp, +static inline bool on_overflow_stack(unsigned long sp, unsigned long size, struct stack_info *info) { unsigned long low = (unsigned long)raw_cpu_ptr(overflow_stack); unsigned long high = low + OVERFLOW_STACK_SIZE; - return on_stack(sp, low, high, STACK_TYPE_OVERFLOW, info); + return on_stack(sp, size, low, high, STACK_TYPE_OVERFLOW, info); } #else -static 
inline bool on_overflow_stack(unsigned long sp, +static inline bool on_overflow_stack(unsigned long sp, unsigned long size, struct stack_info *info) { return false; } #endif @@ -128,21 +128,21 @@ static inline bool on_overflow_stack(unsigned long sp, * context. */ static inline bool on_accessible_stack(const struct task_struct *tsk, - unsigned long sp, + unsigned long sp, unsigned long size, struct stack_info *info) { if (info) info->type = STACK_TYPE_UNKNOWN; - if (on_task_stack(tsk, sp, info)) + if (on_task_stack(tsk, sp, size, info)) return true; if (tsk != current || preemptible()) return false; - if (on_irq_stack(sp, info)) + if (on_irq_stack(sp, size, info)) return true; - if (on_overflow_stack(sp, info)) + if (on_overflow_stack(sp, size, info)) return true; - if (on_sdei_stack(sp, info)) + if (on_sdei_stack(sp, size, info)) return true; return false; diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index eb2f73939b7b..499b6b2f9757 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -122,7 +122,7 @@ static bool regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr) { return ((addr & ~(THREAD_SIZE - 1)) == (kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1))) || - on_irq_stack(addr, NULL); + on_irq_stack(addr, sizeof(unsigned long), NULL); } /** diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c index 2c7ca449dd51..c524f96f97c4 100644 --- a/arch/arm64/kernel/sdei.c +++ b/arch/arm64/kernel/sdei.c @@ -162,31 +162,33 @@ static int init_sdei_scs(void) return err; } -static bool on_sdei_normal_stack(unsigned long sp, struct stack_info *info) +static bool on_sdei_normal_stack(unsigned long sp, unsigned long size, + struct stack_info *info) { unsigned long low = (unsigned long)raw_cpu_read(sdei_stack_normal_ptr); unsigned long high = low + SDEI_STACK_SIZE; - return on_stack(sp, low, high, STACK_TYPE_SDEI_NORMAL, info); + return on_stack(sp, size, low, high, STACK_TYPE_SDEI_NORMAL, info); } -static bool on_sdei_critical_stack(unsigned long sp, struct stack_info *info) +static bool on_sdei_critical_stack(unsigned long sp, unsigned long size, + struct stack_info *info) { unsigned long low = (unsigned long)raw_cpu_read(sdei_stack_critical_ptr); unsigned long high = low + SDEI_STACK_SIZE; - return on_stack(sp, low, high, STACK_TYPE_SDEI_CRITICAL, info); + return on_stack(sp, size, low, high, STACK_TYPE_SDEI_CRITICAL, info); } -bool _on_sdei_stack(unsigned long sp, struct stack_info *info) +bool _on_sdei_stack(unsigned long sp, unsigned long size, struct stack_info *info) { if (!IS_ENABLED(CONFIG_VMAP_STACK)) return false; - if (on_sdei_critical_stack(sp, info)) + if (on_sdei_critical_stack(sp, size, info)) return true; - if (on_sdei_normal_stack(sp, info)) + if (on_sdei_normal_stack(sp, size, info)) return true; return false; diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 36cf05d5eb9e..5c70f247645b 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -78,7 +78,7 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) if (fp & 0xf) return -EINVAL; - if (!on_accessible_stack(tsk, fp, &info)) + if (!on_accessible_stack(tsk, fp, 16, &info)) return -EINVAL; if (test_bit(info.type, frame->stacks_done)) -- cgit v1.2.3 From 33c222aeda14596ca5b9a1a3002858c6c3565ddd Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Wed, 26 May 2021 10:49:26 -0700 Subject: arm64: stacktrace: Relax frame record alignment requirement to 8 bytes The AAPCS places no 
requirements on the alignment of the frame record. In theory it could be placed anywhere, although it seems sensible to require it to be aligned to 8 bytes. With an upcoming enhancement to tag-based KASAN Clang will begin creating frame records located at an address that is only aligned to 8 bytes. Accommodate such frame records in the stack unwinding code. As pointed out by Mark Rutland, the userspace stack unwinding code has the same problem, so fix it there as well. Signed-off-by: Peter Collingbourne Link: https://linux-review.googlesource.com/id/Ia22c375230e67ca055e9e4bb639383567f7ad268 Acked-by: Andrey Konovalov Reviewed-by: Mark Rutland Tested-by: Mark Rutland Link: https://lore.kernel.org/r/20210526174927.2477847-2-pcc@google.com Signed-off-by: Will Deacon --- arch/arm64/kernel/perf_callchain.c | 2 +- arch/arm64/kernel/stacktrace.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c index 88ff471b0bce..4a72c2727309 100644 --- a/arch/arm64/kernel/perf_callchain.c +++ b/arch/arm64/kernel/perf_callchain.c @@ -116,7 +116,7 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry, tail = (struct frame_tail __user *)regs->regs[29]; while (entry->nr < entry->max_stack && - tail && !((unsigned long)tail & 0xf)) + tail && !((unsigned long)tail & 0x7)) tail = user_backtrace(tail, entry); } else { #ifdef CONFIG_COMPAT diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 5c70f247645b..b189de5ca6cb 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -75,7 +75,7 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) if (fp == (unsigned long)task_pt_regs(tsk)->stackframe) return -ENOENT; - if (fp & 0xf) + if (fp & 0x7) return -EINVAL; if (!on_accessible_stack(tsk, fp, 16, &info)) -- cgit v1.2.3 From 483dbf6a35907610597fdc304bd32ecba40cdff0 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 12 May 2021 16:11:29 +0100 Subject: arm64/sve: Split _sve_flush macro into separate Z and predicate flushes Trivial refactoring to support further work, no change to generated code. 
Signed-off-by: Mark Brown Reviewed-by: Dave Martin Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20210512151131.27877-2-broonie@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/fpsimdmacros.h | 4 +++- arch/arm64/kernel/entry-fpsimd.S | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/fpsimdmacros.h b/arch/arm64/include/asm/fpsimdmacros.h index a2563992d2dc..059204477ce6 100644 --- a/arch/arm64/include/asm/fpsimdmacros.h +++ b/arch/arm64/include/asm/fpsimdmacros.h @@ -213,8 +213,10 @@ mov v\nz\().16b, v\nz\().16b .endm -.macro sve_flush +.macro sve_flush_z _for n, 0, 31, _sve_flush_z \n +.endm +.macro sve_flush_p_ffr _for n, 0, 15, _sve_pfalse \n _sve_wrffr 0 .endm diff --git a/arch/arm64/kernel/entry-fpsimd.S b/arch/arm64/kernel/entry-fpsimd.S index 3ecec60d3295..7921d58427c2 100644 --- a/arch/arm64/kernel/entry-fpsimd.S +++ b/arch/arm64/kernel/entry-fpsimd.S @@ -72,7 +72,8 @@ SYM_FUNC_END(sve_load_from_fpsimd_state) /* Zero all SVE registers but the first 128-bits of each vector */ SYM_FUNC_START(sve_flush_live) - sve_flush + sve_flush_z + sve_flush_p_ffr ret SYM_FUNC_END(sve_flush_live) -- cgit v1.2.3 From c9f6890bca111a879a8af1f2390ac49cf05b11df Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 12 May 2021 16:11:30 +0100 Subject: arm64/sve: Use the sve_flush macros in sve_load_from_fpsimd_state() This makes the code a bit clearer and as a result we can also make the indentation more normal, there is no change to the generated code. Signed-off-by: Mark Brown Reviewed-by: Dave Martin Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20210512151131.27877-3-broonie@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/entry-fpsimd.S | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kernel/entry-fpsimd.S b/arch/arm64/kernel/entry-fpsimd.S index 7921d58427c2..dd8382e5ce82 100644 --- a/arch/arm64/kernel/entry-fpsimd.S +++ b/arch/arm64/kernel/entry-fpsimd.S @@ -63,11 +63,10 @@ SYM_FUNC_END(sve_set_vq) * and the rest zeroed. All the other SVE registers will be zeroed. */ SYM_FUNC_START(sve_load_from_fpsimd_state) - sve_load_vq x1, x2, x3 - fpsimd_restore x0, 8 - _for n, 0, 15, _sve_pfalse \n - _sve_wrffr 0 - ret + sve_load_vq x1, x2, x3 + fpsimd_restore x0, 8 + sve_flush_p_ffr + ret SYM_FUNC_END(sve_load_from_fpsimd_state) /* Zero all SVE registers but the first 128-bits of each vector */ -- cgit v1.2.3 From ad4711f962e08eff8d6e9b03f9670b1af6ea9395 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 12 May 2021 16:11:31 +0100 Subject: arm64/sve: Skip flushing Z registers with 128 bit vectors When the SVE vector length is 128 bits then there are no bits in the Z registers which are not shared with the V registers so we can skip them when zeroing state not shared with FPSIMD, this results in a minor performance improvement. 
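Put differently, VQ counts 128-bit quadwords of the vector length, so VQ - 1 == 0 exactly when VL is 128 bits and the Z registers hold nothing beyond their V-register aliases. A sketch of the caller's side, using the helpers already visible in the fpsimd.c hunk below:

    /* VL is in bytes; VQ = VL / 16 counts 128-bit quadwords. */
    unsigned long vq_minus_one =
            sve_vq_from_vl(current->thread.sve_vl) - 1;

    sve_set_vq(vq_minus_one);
    sve_flush_live(vq_minus_one);   /* sve_flush_z is skipped when this is 0 */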
Signed-off-by: Mark Brown Reviewed-by: Dave Martin Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20210512151131.27877-4-broonie@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/fpsimd.h | 2 +- arch/arm64/kernel/entry-fpsimd.S | 12 ++++++++++-- arch/arm64/kernel/fpsimd.c | 6 ++++-- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index 2599504674b5..c072161d5c65 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -69,7 +69,7 @@ static inline void *sve_pffr(struct thread_struct *thread) extern void sve_save_state(void *state, u32 *pfpsr); extern void sve_load_state(void const *state, u32 const *pfpsr, unsigned long vq_minus_1); -extern void sve_flush_live(void); +extern void sve_flush_live(unsigned long vq_minus_1); extern void sve_load_from_fpsimd_state(struct user_fpsimd_state const *state, unsigned long vq_minus_1); extern unsigned int sve_get_vl(void); diff --git a/arch/arm64/kernel/entry-fpsimd.S b/arch/arm64/kernel/entry-fpsimd.S index dd8382e5ce82..0a7a64753878 100644 --- a/arch/arm64/kernel/entry-fpsimd.S +++ b/arch/arm64/kernel/entry-fpsimd.S @@ -69,10 +69,18 @@ SYM_FUNC_START(sve_load_from_fpsimd_state) ret SYM_FUNC_END(sve_load_from_fpsimd_state) -/* Zero all SVE registers but the first 128-bits of each vector */ +/* + * Zero all SVE registers but the first 128-bits of each vector + * + * VQ must already be configured by caller, any further updates of VQ + * will need to ensure that the register state remains valid. + * + * x0 = VQ - 1 + */ SYM_FUNC_START(sve_flush_live) + cbz x0, 1f // A VQ-1 of 0 is 128 bits so no extra Z state sve_flush_z - sve_flush_p_ffr +1: sve_flush_p_ffr ret SYM_FUNC_END(sve_flush_live) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index ad3dd34a83cf..e57b23f95284 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -957,8 +957,10 @@ void do_sve_acc(unsigned int esr, struct pt_regs *regs) * disabling the trap, otherwise update our in-memory copy. */ if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) { - sve_set_vq(sve_vq_from_vl(current->thread.sve_vl) - 1); - sve_flush_live(); + unsigned long vq_minus_one = + sve_vq_from_vl(current->thread.sve_vl) - 1; + sve_set_vq(vq_minus_one); + sve_flush_live(vq_minus_one); fpsimd_bind_task_to_cpu(); } else { fpsimd_to_sve(current); -- cgit v1.2.3 From 7513cc8a1b741bee6fb39cbb94a9842d37ca3ace Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 26 May 2021 20:36:20 +0100 Subject: arm64: Change the cpuinfo_arm64 member type for some sysregs to u64 The architecture has been updated and the CTR_EL0, CNTFRQ_EL0, DCZID_EL0, MIDR_EL1, REVIDR_EL1 registers are all 64-bit, even if most of them have a RES0 top 32-bit. Change their type to u64 in struct cpuinfo_arm64. 
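To see why the width matters, consider this hypothetical snippet (read_cpuid() is the accessor already used by cpuinfo.c; the variable names are placeholders):

    u64 midr = read_cpuid(MIDR_EL1);  /* architecturally a 64-bit register */
    u32 narrow = midr;                /* old u32 field: top 32 bits silently dropped */
    u64 wide = midr;                  /* new u64 field: full value preserved */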
Signed-off-by: Catalin Marinas Cc: Will Deacon Acked-by: Mark Rutland Reviewed-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20210526193621.21559-2-catalin.marinas@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/cpu.h | 10 +++++----- arch/arm64/kernel/cpuinfo.c | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h index 7faae6ff3ab4..fe5a8499ddc2 100644 --- a/arch/arm64/include/asm/cpu.h +++ b/arch/arm64/include/asm/cpu.h @@ -15,11 +15,11 @@ struct cpuinfo_arm64 { struct cpu cpu; struct kobject kobj; - u32 reg_ctr; - u32 reg_cntfrq; - u32 reg_dczid; - u32 reg_midr; - u32 reg_revidr; + u64 reg_ctr; + u64 reg_cntfrq; + u64 reg_dczid; + u64 reg_midr; + u64 reg_revidr; u64 reg_id_aa64dfr0; u64 reg_id_aa64dfr1; diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 51fcf99d5351..0e9e965e18d8 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -246,7 +246,7 @@ static struct kobj_type cpuregs_kobj_type = { struct cpuinfo_arm64 *info = kobj_to_cpuinfo(kobj); \ \ if (info->reg_midr) \ - return sprintf(buf, "0x%016x\n", info->reg_##_field); \ + return sprintf(buf, "0x%016llx\n", info->reg_##_field); \ else \ return 0; \ } \ -- cgit v1.2.3 From 21047e91a5a674b97ebbf2c2c1751f1e9c317f09 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 26 May 2021 20:36:21 +0100 Subject: arm64: Check if GMID_EL1.BS is the same on all CPUs The GMID_EL1.BS field determines the number of tags accessed by the LDGM/STGM instructions (EL1 and up), used by the kernel for copying or zeroing page tags. Taint the kernel if GMID_EL1.BS differs between CPUs, but only if CONFIG_ARM64_MTE is enabled. Signed-off-by: Catalin Marinas Cc: Will Deacon Cc: Mark Rutland Cc: Suzuki K Poulose Link: https://lore.kernel.org/r/20210526193621.21559-3-catalin.marinas@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/cpu.h | 1 + arch/arm64/include/asm/cpufeature.h | 7 +++++++ arch/arm64/kernel/cpufeature.c | 21 +++++++++++++++++++++ arch/arm64/kernel/cpuinfo.c | 3 +++ 4 files changed, 32 insertions(+) diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h index fe5a8499ddc2..9088e72c7cf6 100644 --- a/arch/arm64/include/asm/cpu.h +++ b/arch/arm64/include/asm/cpu.h @@ -20,6 +20,7 @@ struct cpuinfo_arm64 { u64 reg_dczid; u64 reg_midr; u64 reg_revidr; + u64 reg_gmid; u64 reg_id_aa64dfr0; u64 reg_id_aa64dfr1; diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 338840c00e8e..650de920e067 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -619,6 +619,13 @@ static inline bool id_aa64pfr0_sve(u64 pfr0) return val > 0; } +static inline bool id_aa64pfr1_mte(u64 pfr1) +{ + u32 val = cpuid_feature_extract_unsigned_field(pfr1, ID_AA64PFR1_MTE_SHIFT); + + return val >= ID_AA64PFR1_MTE; +} + void __init setup_cpu_features(void); void check_local_cpu_capabilities(void); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index efed2830d141..0645300cc1a8 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -400,6 +400,11 @@ static const struct arm64_ftr_bits ftr_dczid[] = { ARM64_FTR_END, }; +static const struct arm64_ftr_bits ftr_gmid[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, SYS_GMID_EL1_BS_SHIFT, 4, 0), + ARM64_FTR_END, +}; + static const struct arm64_ftr_bits ftr_id_isar0[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT,
FTR_LOWER_SAFE, ID_ISAR0_DIVIDE_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_DEBUG_SHIFT, 4, 0), @@ -617,6 +622,9 @@ static const struct __ftr_reg_entry { /* Op1 = 0, CRn = 1, CRm = 2 */ ARM64_FTR_REG(SYS_ZCR_EL1, ftr_zcr), + /* Op1 = 1, CRn = 0, CRm = 0 */ + ARM64_FTR_REG(SYS_GMID_EL1, ftr_gmid), + /* Op1 = 3, CRn = 0, CRm = 0 */ { SYS_CTR_EL0, &arm64_ftr_reg_ctrel0 }, ARM64_FTR_REG(SYS_DCZID_EL0, ftr_dczid), @@ -911,6 +919,9 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info) sve_init_vq_map(); } + if (id_aa64pfr1_mte(info->reg_id_aa64pfr1)) + init_cpu_ftr_reg(SYS_GMID_EL1, info->reg_gmid); + /* * Initialize the indirect array of CPU hwcaps capabilities pointers * before we handle the boot CPU below. @@ -1134,6 +1145,16 @@ void update_cpu_features(int cpu, sve_update_vq_map(); } + /* + * The kernel uses the LDGM/STGM instructions and the number of tags + * they read/write depends on the GMID_EL1.BS field. Check that the + * value is the same on all CPUs. + */ + if (IS_ENABLED(CONFIG_ARM64_MTE) && + id_aa64pfr1_mte(info->reg_id_aa64pfr1)) + taint |= check_update_ftr_reg(SYS_GMID_EL1, cpu, + info->reg_gmid, boot->reg_gmid); + /* * This relies on a sanitised view of the AArch64 ID registers * (e.g. SYS_ID_AA64PFR0_EL1), so we call it last. diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 0e9e965e18d8..5321b8218591 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -371,6 +371,9 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) info->reg_id_aa64pfr1 = read_cpuid(ID_AA64PFR1_EL1); info->reg_id_aa64zfr0 = read_cpuid(ID_AA64ZFR0_EL1); + if (id_aa64pfr1_mte(info->reg_id_aa64pfr1)) + info->reg_gmid = read_cpuid(GMID_EL1); + /* Update the 32bit ID registers only if AArch32 is implemented */ if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) { info->reg_id_dfr0 = read_cpuid(ID_DFR0_EL1); -- cgit v1.2.3 From e176e2677cccd458f99c69d16d27f86adcdd02e4 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 20 May 2021 12:50:27 +0100 Subject: arm64: assembler: add set_this_cpu_offset There should be no functional change as a result of this patch. 
Signed-off-by: Mark Rutland Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: James Morse Cc: Marc Zyngier Cc: Suzuki Poulose Cc: Will Deacon Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20210520115031.18509-3-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/assembler.h | 18 +++++++++++++----- arch/arm64/mm/proc.S | 12 ++---------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 8418c1bd8f04..f0188903557f 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -232,15 +232,23 @@ lr .req x30 // link register * @dst: destination register */ #if defined(__KVM_NVHE_HYPERVISOR__) || defined(__KVM_VHE_HYPERVISOR__) - .macro this_cpu_offset, dst + .macro get_this_cpu_offset, dst mrs \dst, tpidr_el2 .endm #else - .macro this_cpu_offset, dst + .macro get_this_cpu_offset, dst alternative_if_not ARM64_HAS_VIRT_HOST_EXTN mrs \dst, tpidr_el1 alternative_else mrs \dst, tpidr_el2 +alternative_endif + .endm + + .macro set_this_cpu_offset, src +alternative_if_not ARM64_HAS_VIRT_HOST_EXTN + msr tpidr_el1, \src +alternative_else + msr tpidr_el2, \src alternative_endif .endm #endif @@ -253,7 +261,7 @@ alternative_endif .macro adr_this_cpu, dst, sym, tmp adrp \tmp, \sym add \dst, \tmp, #:lo12:\sym - this_cpu_offset \tmp + get_this_cpu_offset \tmp add \dst, \dst, \tmp .endm @@ -264,7 +272,7 @@ alternative_endif */ .macro ldr_this_cpu dst, sym, tmp adr_l \dst, \sym - this_cpu_offset \tmp + get_this_cpu_offset \tmp ldr \dst, [\dst, \tmp] .endm @@ -745,7 +753,7 @@ USER(\label, ic ivau, \tmp2) // invalidate I line PoU cbz \tmp, \lbl #endif adr_l \tmp, irq_stat + IRQ_CPUSTAT_SOFTIRQ_PENDING - this_cpu_offset \tmp2 + get_this_cpu_offset \tmp2 ldr w\tmp, [\tmp, \tmp2] cbnz w\tmp, \lbl // yield on pending softirq in task context .Lnoyield_\@: diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 97d7bcd8d4f2..bc555cd5e6b1 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -83,11 +83,7 @@ SYM_FUNC_START(cpu_do_suspend) mrs x9, mdscr_el1 mrs x10, oslsr_el1 mrs x11, sctlr_el1 -alternative_if_not ARM64_HAS_VIRT_HOST_EXTN - mrs x12, tpidr_el1 -alternative_else - mrs x12, tpidr_el2 -alternative_endif + get_this_cpu_offset x12 mrs x13, sp_el0 stp x2, x3, [x0] stp x4, x5, [x0, #16] @@ -145,11 +141,7 @@ SYM_FUNC_START(cpu_do_resume) msr mdscr_el1, x10 msr sctlr_el1, x12 -alternative_if_not ARM64_HAS_VIRT_HOST_EXTN - msr tpidr_el1, x13 -alternative_else - msr tpidr_el2, x13 -alternative_endif + set_this_cpu_offset x13 msr sp_el0, x14 /* * Restore oslsr_el1 by writing oslar_el1 -- cgit v1.2.3 From 98c7a1666ee94af59a65f2787a887a05a546d163 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 20 May 2021 12:50:28 +0100 Subject: arm64: smp: remove pointless secondary_data maintenance All reads and writes of secondary_data occur with the MMU on, using coherent attributes, so there's no need to perform any cache maintenance for this. There should be no functional change as a result of this patch. 
Signed-off-by: Mark Rutland Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: James Morse Cc: Marc Zyngier Cc: Suzuki Poulose Cc: Will Deacon Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20210520115031.18509-4-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/smp.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index dcd7041b2b07..92e83e8bac94 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -122,7 +122,6 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) secondary_data.task = idle; secondary_data.stack = task_stack_page(idle) + THREAD_SIZE; update_cpu_boot_status(CPU_MMU_OFF); - __flush_dcache_area(&secondary_data, sizeof(secondary_data)); /* Now bring the CPU into our world */ ret = boot_secondary(cpu, idle); @@ -143,7 +142,6 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) pr_crit("CPU%u: failed to come online\n", cpu); secondary_data.task = NULL; secondary_data.stack = NULL; - __flush_dcache_area(&secondary_data, sizeof(secondary_data)); status = READ_ONCE(secondary_data.status); if (status == CPU_MMU_OFF) status = READ_ONCE(__early_cpu_boot_status); -- cgit v1.2.3 From 3305e7f74a14cdb19e61af4febb098ad62820d71 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 20 May 2021 12:50:29 +0100 Subject: arm64: smp: remove stack from secondary_data When we boot a secondary CPU, we pass it a task and a stack to use. As the stack is always the task's stack, which can be derived from the task, let's have the secondary CPU derive this itself and avoid passing redundant information. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: James Morse Cc: Marc Zyngier Cc: Suzuki Poulose Cc: Will Deacon Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20210520115031.18509-5-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/smp.h | 2 -- arch/arm64/kernel/asm-offsets.c | 1 - arch/arm64/kernel/head.S | 7 ++++--- arch/arm64/kernel/smp.c | 2 -- 4 files changed, 4 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h index 0e357757c0cc..fc55f5a57a06 100644 --- a/arch/arm64/include/asm/smp.h +++ b/arch/arm64/include/asm/smp.h @@ -73,12 +73,10 @@ asmlinkage void secondary_start_kernel(void); /* * Initial data for bringing up a secondary CPU. - * @stack - sp for the secondary CPU * @status - Result passed back from the secondary CPU to * indicate failure. 
*/ struct secondary_data { - void *stack; struct task_struct *task; long status; }; diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 0cb34ccb6e73..4a5e204c33af 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -99,7 +99,6 @@ int main(void) DEFINE(SOFTIRQ_SHIFT, SOFTIRQ_SHIFT); DEFINE(IRQ_CPUSTAT_SOFTIRQ_PENDING, offsetof(irq_cpustat_t, __softirq_pending)); BLANK(); - DEFINE(CPU_BOOT_STACK, offsetof(struct secondary_data, stack)); DEFINE(CPU_BOOT_TASK, offsetof(struct secondary_data, task)); BLANK(); DEFINE(FTR_OVR_VAL_OFFSET, offsetof(struct arm64_ftr_override, val)); diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index cc2d45d54838..9be95e11367d 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -645,11 +645,12 @@ SYM_FUNC_START_LOCAL(__secondary_switched) isb adr_l x0, secondary_data - ldr x1, [x0, #CPU_BOOT_STACK] // get secondary_data.stack - cbz x1, __secondary_too_slow - mov sp, x1 ldr x2, [x0, #CPU_BOOT_TASK] cbz x2, __secondary_too_slow + + ldr x1, [x2, #TSK_STACK] + add sp, x1, #THREAD_SIZE + msr sp_el0, x2 scs_load x2, x3 setup_final_frame diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 92e83e8bac94..73625cc39574 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -120,7 +120,6 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) * page tables. */ secondary_data.task = idle; - secondary_data.stack = task_stack_page(idle) + THREAD_SIZE; update_cpu_boot_status(CPU_MMU_OFF); /* Now bring the CPU into our world */ @@ -141,7 +140,6 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) pr_crit("CPU%u: failed to come online\n", cpu); secondary_data.task = NULL; - secondary_data.stack = NULL; status = READ_ONCE(secondary_data.status); if (status == CPU_MMU_OFF) status = READ_ONCE(__early_cpu_boot_status); -- cgit v1.2.3 From 8e334d729bc4787f728e9e5abc91649f131124ff Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 20 May 2021 12:50:30 +0100 Subject: arm64: smp: unify task and sp setup Once we enable the MMU, we have to initialize: * SP_EL0 to point at the active task * SP to point at the active task's stack * SCS_SP to point at the active task's shadow stack For all tasks (including init_task), this information can be derived from the task's task_struct. Let's unify __primary_switched and __secondary_switched to consistently acquire this information from the relevant task_struct. At the same time, let's fold this together with initializing a task's final frame. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: James Morse Cc: Marc Zyngier Cc: Suzuki Poulose Cc: Will Deacon Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20210520115031.18509-6-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/head.S | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 9be95e11367d..e83b2899dce5 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -395,15 +395,24 @@ SYM_FUNC_START_LOCAL(__create_page_tables) SYM_FUNC_END(__create_page_tables) /* + * Initialize CPU registers with task-specific and cpu-specific context. + * * Create a final frame record at task_pt_regs(current)->stackframe, so * that the unwinder can identify the final frame record of any task by * its location in the task stack. 
We reserve the entire pt_regs space * for consistency with user tasks and kthreads. */ - .macro setup_final_frame + .macro init_cpu_task tsk, tmp + msr sp_el0, \tsk + + ldr \tmp, [\tsk, #TSK_STACK] + add sp, \tmp, #THREAD_SIZE sub sp, sp, #PT_REGS_SIZE + stp xzr, xzr, [sp, #S_STACKFRAME] add x29, sp, #S_STACKFRAME + + scs_load \tsk, \tmp .endm /* @@ -412,22 +421,16 @@ SYM_FUNC_END(__create_page_tables) * x0 = __PHYS_OFFSET */ SYM_FUNC_START_LOCAL(__primary_switched) - adrp x4, init_thread_union - add sp, x4, #THREAD_SIZE - adr_l x5, init_task - msr sp_el0, x5 // Save thread_info + adr_l x4, init_task + init_cpu_task x4, x5 adr_l x8, vectors // load VBAR_EL1 with virtual msr vbar_el1, x8 // vector table address isb - stp xzr, x30, [sp, #-16]! + stp x29, x30, [sp, #-16]! mov x29, sp -#ifdef CONFIG_SHADOW_CALL_STACK - adr_l scs_sp, init_shadow_call_stack // Set shadow call stack -#endif - str_l x21, __fdt_pointer, x5 // Save FDT pointer ldr_l x4, kimage_vaddr // Save the offset between @@ -459,8 +462,7 @@ SYM_FUNC_START_LOCAL(__primary_switched) 0: #endif bl switch_to_vhe // Prefer VHE if possible - add sp, sp, #16 - setup_final_frame + ldp x29, x30, [sp], #16 bl start_kernel ASM_BUG() SYM_FUNC_END(__primary_switched) @@ -648,12 +650,7 @@ SYM_FUNC_START_LOCAL(__secondary_switched) ldr x2, [x0, #CPU_BOOT_TASK] cbz x2, __secondary_too_slow - ldr x1, [x2, #TSK_STACK] - add sp, x1, #THREAD_SIZE - - msr sp_el0, x2 - scs_load x2, x3 - setup_final_frame + init_cpu_task x2, x1 #ifdef CONFIG_ARM64_PTR_AUTH ptrauth_keys_init_cpu x2, x3, x4, x5 -- cgit v1.2.3 From 3d8c1a013d78f32ee266097496cbd89b734b5fcb Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 20 May 2021 12:50:31 +0100 Subject: arm64: smp: initialize cpu offset earlier Now that we have a consistent place to initialize CPU context registers early in the boot path, let's also initialize the per-cpu offset here. This makes the primary and secondary boot paths more consistent, and allows for the use of per-cpu operations earlier, which will be necessary for instrumentation with KCSAN. Note that smp_prepare_boot_cpu() still needs to re-initialize CPU0's offset as immediately prior to this the per-cpu areas may be reallocated, and hence the boot-time offset may be stale. A comment is added to make this clear. Signed-off-by: Mark Rutland Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: James Morse Cc: Marc Zyngier Cc: Suzuki Poulose Cc: Will Deacon Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20210520115031.18509-7-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/asm-offsets.c | 1 + arch/arm64/kernel/head.S | 17 +++++++++++------ arch/arm64/kernel/setup.c | 6 ------ arch/arm64/kernel/smp.c | 10 ++++++---- 4 files changed, 18 insertions(+), 16 deletions(-) diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 4a5e204c33af..bd0fc23d8719 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -27,6 +27,7 @@ int main(void) { DEFINE(TSK_ACTIVE_MM, offsetof(struct task_struct, active_mm)); + DEFINE(TSK_CPU, offsetof(struct task_struct, cpu)); BLANK(); DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags)); DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count)); diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index e83b2899dce5..070ed53c049d 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -402,17 +402,22 @@ SYM_FUNC_END(__create_page_tables) * its location in the task stack. 
We reserve the entire pt_regs space * for consistency with user tasks and kthreads. */ - .macro init_cpu_task tsk, tmp + .macro init_cpu_task tsk, tmp1, tmp2 msr sp_el0, \tsk - ldr \tmp, [\tsk, #TSK_STACK] - add sp, \tmp, #THREAD_SIZE + ldr \tmp1, [\tsk, #TSK_STACK] + add sp, \tmp1, #THREAD_SIZE sub sp, sp, #PT_REGS_SIZE stp xzr, xzr, [sp, #S_STACKFRAME] add x29, sp, #S_STACKFRAME - scs_load \tsk, \tmp + scs_load \tsk, \tmp1 + + adr_l \tmp1, __per_cpu_offset + ldr w\tmp2, [\tsk, #TSK_CPU] + ldr \tmp1, [\tmp1, \tmp2, lsl #3] + set_this_cpu_offset \tmp1 .endm /* @@ -422,7 +427,7 @@ SYM_FUNC_END(__create_page_tables) */ SYM_FUNC_START_LOCAL(__primary_switched) adr_l x4, init_task - init_cpu_task x4, x5 + init_cpu_task x4, x5, x6 adr_l x8, vectors // load VBAR_EL1 with virtual msr vbar_el1, x8 // vector table address @@ -650,7 +655,7 @@ SYM_FUNC_START_LOCAL(__secondary_switched) ldr x2, [x0, #CPU_BOOT_TASK] cbz x2, __secondary_too_slow - init_cpu_task x2, x1 + init_cpu_task x2, x1, x3 #ifdef CONFIG_ARM64_PTR_AUTH ptrauth_keys_init_cpu x2, x3, x4, x5 diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 61845c0821d9..b7a35a03e9b9 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -87,12 +87,6 @@ void __init smp_setup_processor_id(void) u64 mpidr = read_cpuid_mpidr() & MPIDR_HWID_BITMASK; set_cpu_logical_map(0, mpidr); - /* - * clear __my_cpu_offset on boot CPU to avoid hang caused by - * using percpu variable early, for example, lockdep will - * access percpu variable inside lock_release - */ - set_my_cpu_offset(0); pr_info("Booting Linux on physical CPU 0x%010lx [0x%08x]\n", (unsigned long)mpidr, read_cpuid_id()); } diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 73625cc39574..2fe8fab886e2 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -198,10 +198,7 @@ asmlinkage notrace void secondary_start_kernel(void) u64 mpidr = read_cpuid_mpidr() & MPIDR_HWID_BITMASK; struct mm_struct *mm = &init_mm; const struct cpu_operations *ops; - unsigned int cpu; - - cpu = task_cpu(current); - set_my_cpu_offset(per_cpu_offset(cpu)); + unsigned int cpu = smp_processor_id(); /* * All kernel threads share the same mm context; grab a @@ -448,6 +445,11 @@ void __init smp_cpus_done(unsigned int max_cpus) void __init smp_prepare_boot_cpu(void) { + /* + * The runtime per-cpu areas have been allocated by + * setup_per_cpu_areas(), and CPU0's boot time per-cpu area will be + * freed shortly, so we must move over to the runtime per-cpu area. + */ set_my_cpu_offset(per_cpu_offset(smp_processor_id())); cpuinfo_store_boot_cpu(); -- cgit v1.2.3 From 1cbdf60bd1b74e397d48aa877367cfc621f45ffe Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Wed, 26 May 2021 10:49:27 -0700 Subject: kasan: arm64: support specialized outlined tag mismatch checks By using outlined checks we can achieve a significant code size improvement by moving the tag-based ASAN checks into separate functions. Unlike the existing CONFIG_KASAN_OUTLINE mode these functions have a custom calling convention that preserves most registers and is specialized to the register containing the address and the type of access, and as a result we can eliminate the code size and performance overhead of a standard calling convention such as AAPCS for these functions. This change depends on a separate series of changes to Clang [1] to support outlined checks in the kernel, although the change works fine without them (we just don't get outlined checks). 
This is because the flag -mllvm -hwasan-inline-all-checks=0 has no effect until the Clang changes land. The flag was introduced in the Clang 9.0 timeframe as part of the support for outlined checks in userspace and because our minimum Clang version is 10.0 we can pass it unconditionally. Outlined checks require a new runtime function with a custom calling convention. Add this function to arch/arm64/lib. I measured the code size of defconfig + tag-based KASAN, as well as boot time (i.e. time to init launch) on a DragonBoard 845c with an Android arm64 GKI kernel. The results are below: code size boot time CONFIG_KASAN_INLINE=y before 92824064 6.18s CONFIG_KASAN_INLINE=y after 38822400 6.65s CONFIG_KASAN_OUTLINE=y 39215616 11.48s We can see straight away that specialized outlined checks beat the existing CONFIG_KASAN_OUTLINE=y on both code size and boot time for tag-based ASAN. As for the comparison between CONFIG_KASAN_INLINE=y before and after we saw similar performance numbers in userspace [2] and decided that since the performance overhead is minimal compared to the overhead of tag-based ASAN itself as well as compared to the code size improvements we would just replace the inlined checks with the specialized outlined checks without the option to select between them, and that is what I have implemented in this patch. Signed-off-by: Peter Collingbourne Acked-by: Andrey Konovalov Reviewed-by: Mark Rutland Tested-by: Mark Rutland Link: https://linux-review.googlesource.com/id/I1a30036c70ab3c3ee78d75ed9b87ef7cdc3fdb76 Link: [1] https://reviews.llvm.org/D90426 Link: [2] https://reviews.llvm.org/D56954 Link: https://lore.kernel.org/r/20210526174927.2477847-3-pcc@google.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/asm-prototypes.h | 6 +++ arch/arm64/include/asm/module.lds.h | 17 +++++++- arch/arm64/lib/Makefile | 2 + arch/arm64/lib/kasan_sw_tags.S | 76 +++++++++++++++++++++++++++++++++ mm/kasan/sw_tags.c | 7 +++ scripts/Makefile.kasan | 1 + 6 files changed, 107 insertions(+), 2 deletions(-) create mode 100644 arch/arm64/lib/kasan_sw_tags.S diff --git a/arch/arm64/include/asm/asm-prototypes.h b/arch/arm64/include/asm/asm-prototypes.h index 1c9a3a0c5fa5..ec1d9655f885 100644 --- a/arch/arm64/include/asm/asm-prototypes.h +++ b/arch/arm64/include/asm/asm-prototypes.h @@ -23,4 +23,10 @@ long long __ashlti3(long long a, int b); long long __ashrti3(long long a, int b); long long __lshrti3(long long a, int b); +/* + * This function uses a custom calling convention and cannot be called from C so + * this prototype is not entirely accurate. + */ +void __hwasan_tag_mismatch(unsigned long addr, unsigned long access_info); + #endif /* __ASM_PROTOTYPES_H */ diff --git a/arch/arm64/include/asm/module.lds.h b/arch/arm64/include/asm/module.lds.h index 810045628c66..a11ccadd47d2 100644 --- a/arch/arm64/include/asm/module.lds.h +++ b/arch/arm64/include/asm/module.lds.h @@ -1,7 +1,20 @@ -#ifdef CONFIG_ARM64_MODULE_PLTS SECTIONS { +#ifdef CONFIG_ARM64_MODULE_PLTS .plt 0 (NOLOAD) : { BYTE(0) } .init.plt 0 (NOLOAD) : { BYTE(0) } .text.ftrace_trampoline 0 (NOLOAD) : { BYTE(0) } -} #endif + +#ifdef CONFIG_KASAN_SW_TAGS + /* + * Outlined checks go into comdat-deduplicated sections named .text.hot. + * Because they are in comdats they are not combined by the linker and + * we otherwise end up with multiple sections with the same .text.hot + * name in the .ko file. 
The kernel module loader warns if it sees + * multiple sections with the same name so we use this sections + * directive to force them into a single section and silence the + * warning. + */ + .text.hot : { *(.text.hot) } +#endif +} diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index d31e1169d9b8..8e60d76a1b47 100644 --- a/arch/arm64/lib/Makefile +++ b/arch/arm64/lib/Makefile @@ -18,3 +18,5 @@ obj-$(CONFIG_CRC32) += crc32.o obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o obj-$(CONFIG_ARM64_MTE) += mte.o + +obj-$(CONFIG_KASAN_SW_TAGS) += kasan_sw_tags.o diff --git a/arch/arm64/lib/kasan_sw_tags.S b/arch/arm64/lib/kasan_sw_tags.S new file mode 100644 index 000000000000..5b04464c045e --- /dev/null +++ b/arch/arm64/lib/kasan_sw_tags.S @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2020 Google LLC + */ + +#include +#include + +/* + * Report a tag mismatch detected by tag-based KASAN. + * + * A compiler-generated thunk calls this with a non-AAPCS calling + * convention. Upon entry to this function, registers are as follows: + * + * x0: fault address (see below for restore) + * x1: fault description (see below for restore) + * x2 to x15: callee-saved + * x16 to x17: safe to clobber + * x18 to x30: callee-saved + * sp: pre-decremented by 256 bytes (see below for restore) + * + * The caller has decremented the SP by 256 bytes, and created a + * structure on the stack as follows: + * + * sp + 0..15: x0 and x1 to be restored + * sp + 16..231: free for use + * sp + 232..247: x29 and x30 (same as in GPRs) + * sp + 248..255: free for use + * + * Note that this is not a struct pt_regs. + * + * To call a regular AAPCS function we must save x2 to x15 (which we can + * store in the gaps), and create a frame record (for which we can use + * x29 and x30 spilled by the caller as those match the GPRs). + * + * The caller expects x0 and x1 to be restored from the structure, and + * for the structure to be removed from the stack (i.e. the SP must be + * incremented by 256 prior to return). 
+ */ +SYM_CODE_START(__hwasan_tag_mismatch) +#ifdef BTI_C + BTI_C +#endif + add x29, sp, #232 + stp x2, x3, [sp, #8 * 2] + stp x4, x5, [sp, #8 * 4] + stp x6, x7, [sp, #8 * 6] + stp x8, x9, [sp, #8 * 8] + stp x10, x11, [sp, #8 * 10] + stp x12, x13, [sp, #8 * 12] + stp x14, x15, [sp, #8 * 14] +#ifndef CONFIG_SHADOW_CALL_STACK + str x18, [sp, #8 * 18] +#endif + + mov x2, x30 + bl kasan_tag_mismatch + + ldp x0, x1, [sp] + ldp x2, x3, [sp, #8 * 2] + ldp x4, x5, [sp, #8 * 4] + ldp x6, x7, [sp, #8 * 6] + ldp x8, x9, [sp, #8 * 8] + ldp x10, x11, [sp, #8 * 10] + ldp x12, x13, [sp, #8 * 12] + ldp x14, x15, [sp, #8 * 14] +#ifndef CONFIG_SHADOW_CALL_STACK + ldr x18, [sp, #8 * 18] +#endif + ldp x29, x30, [sp, #8 * 29] + + /* remove the structure from the stack */ + add sp, sp, #256 + ret +SYM_CODE_END(__hwasan_tag_mismatch) +EXPORT_SYMBOL(__hwasan_tag_mismatch) diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c index 9df8e7f69e87..9362938abbfa 100644 --- a/mm/kasan/sw_tags.c +++ b/mm/kasan/sw_tags.c @@ -207,3 +207,10 @@ struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, return &alloc_meta->free_track[i]; } + +void kasan_tag_mismatch(unsigned long addr, unsigned long access_info, + unsigned long ret_ip) +{ + kasan_report(addr, 1 << (access_info & 0xf), access_info & 0x10, + ret_ip); +} diff --git a/scripts/Makefile.kasan b/scripts/Makefile.kasan index 3d791908ed36..801c415bac59 100644 --- a/scripts/Makefile.kasan +++ b/scripts/Makefile.kasan @@ -50,6 +50,7 @@ endif CFLAGS_KASAN := -fsanitize=kernel-hwaddress \ $(call cc-param,hwasan-instrument-stack=$(stack_enable)) \ $(call cc-param,hwasan-use-short-granules=0) \ + $(call cc-param,hwasan-inline-all-checks=0) \ $(instrumentation_flags) endif # CONFIG_KASAN_SW_TAGS -- cgit v1.2.3 From 5f154c4e20d7edd38bddec78f3e0a7628057ef76 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Wed, 3 Mar 2021 18:05:29 +0100 Subject: arm64: Move patching utilities out of instruction encoding/decoding Files insn.[c|h] containt some functions used for instruction patching. In order to reuse the instruction encoder/decoder, move the patching utilities to their own file. 
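The interface being relocated is small: aarch64_insn_read()/aarch64_insn_write() and the aarch64_insn_patch_text*() helpers, now declared in the new <asm/patching.h> (pulled in via <asm/insn.h>). A hedged usage sketch follows; the wrapper is hypothetical, but the helpers it calls are the ones moved by this patch, and aarch64_insn_gen_nop() is part of the encoder that remains in insn.c:

	#include <asm/insn.h>	/* includes <asm/patching.h> after this patch */

	/* Illustration only: replace the instruction at @addr with a NOP.
	 * aarch64_insn_patch_text_nosync() writes the word via the text-poke
	 * fixmap and then flushes the I-cache range for it. */
	static int sketch_patch_to_nop(void *addr)
	{
		u32 nop = aarch64_insn_gen_nop();

		return aarch64_insn_patch_text_nosync(addr, nop);
	}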
Signed-off-by: Julien Thierry Link: https://lore.kernel.org/r/20210303170536.1838032-2-jthierry@redhat.com [will: Include patching.h in insn.h to fix header mess; add __ASSEMBLY__ guards] Signed-off-by: Will Deacon Signed-off-by: Will Deacon --- arch/arm64/include/asm/insn.h | 6 +- arch/arm64/include/asm/patching.h | 15 ++++ arch/arm64/kernel/Makefile | 2 +- arch/arm64/kernel/insn.c | 149 +------------------------------------- arch/arm64/kernel/patching.c | 148 +++++++++++++++++++++++++++++++++++++ 5 files changed, 168 insertions(+), 152 deletions(-) create mode 100644 arch/arm64/include/asm/patching.h create mode 100644 arch/arm64/kernel/patching.c diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index 4ebb9c054ccc..f08579e5119e 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -11,6 +11,7 @@ #include #include +#include #ifndef __ASSEMBLY__ /* @@ -379,8 +380,6 @@ static inline bool aarch64_insn_is_adr_adrp(u32 insn) return aarch64_insn_is_adr(insn) || aarch64_insn_is_adrp(insn); } -int aarch64_insn_read(void *addr, u32 *insnp); -int aarch64_insn_write(void *addr, u32 insn); enum aarch64_insn_encoding_class aarch64_get_insn_class(u32 insn); bool aarch64_insn_uses_literal(u32 insn); bool aarch64_insn_is_branch(u32 insn); @@ -487,9 +486,6 @@ u32 aarch64_insn_gen_prefetch(enum aarch64_insn_register base, s32 aarch64_get_branch_offset(u32 insn); u32 aarch64_set_branch_offset(u32 insn, s32 offset); -int aarch64_insn_patch_text_nosync(void *addr, u32 insn); -int aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt); - s32 aarch64_insn_adrp_get_offset(u32 insn); u32 aarch64_insn_adrp_set_offset(u32 insn, s32 offset); diff --git a/arch/arm64/include/asm/patching.h b/arch/arm64/include/asm/patching.h new file mode 100644 index 000000000000..5ebab129222f --- /dev/null +++ b/arch/arm64/include/asm/patching.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __ASM_PATCHING_H +#define __ASM_PATCHING_H + +#include + +#ifndef __ASSEMBLY__ +int aarch64_insn_read(void *addr, u32 *insnp); +int aarch64_insn_write(void *addr, u32 insn); + +int aarch64_insn_patch_text_nosync(void *addr, u32 insn); +int aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt); +#endif /* __ASSEMBLY__ */ + +#endif /* __ASM_PATCHING_H */ diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 6cc97730790e..3693156acc75 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -22,7 +22,7 @@ obj-y := debug-monitors.o entry.o irq.o fpsimd.o \ return_address.o cpuinfo.o cpu_errata.o \ cpufeature.o alternative.o cacheinfo.o \ smp.o smp_spin_table.o topology.o smccc-call.o \ - syscall.o proton-pack.o idreg-override.o + syscall.o proton-pack.o idreg-override.o patching.o targets += efi-entry.o diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c index 6c0de2f60ea9..952e7d6fe60e 100644 --- a/arch/arm64/kernel/insn.c +++ b/arch/arm64/kernel/insn.c @@ -7,21 +7,14 @@ */ #include #include -#include -#include -#include -#include -#include -#include +#include +#include #include -#include -#include #include -#include +#include #include #include -#include #define AARCH64_INSN_SF_BIT BIT(31) #define AARCH64_INSN_N_BIT BIT(22) @@ -83,81 +76,6 @@ bool aarch64_insn_is_branch_imm(u32 insn) aarch64_insn_is_bcond(insn)); } -static DEFINE_RAW_SPINLOCK(patch_lock); - -static bool is_exit_text(unsigned long addr) -{ - /* discarded with init text/data */ - return system_state < SYSTEM_RUNNING && - addr >= (unsigned 
long)__exittext_begin && - addr < (unsigned long)__exittext_end; -} - -static bool is_image_text(unsigned long addr) -{ - return core_kernel_text(addr) || is_exit_text(addr); -} - -static void __kprobes *patch_map(void *addr, int fixmap) -{ - unsigned long uintaddr = (uintptr_t) addr; - bool image = is_image_text(uintaddr); - struct page *page; - - if (image) - page = phys_to_page(__pa_symbol(addr)); - else if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) - page = vmalloc_to_page(addr); - else - return addr; - - BUG_ON(!page); - return (void *)set_fixmap_offset(fixmap, page_to_phys(page) + - (uintaddr & ~PAGE_MASK)); -} - -static void __kprobes patch_unmap(int fixmap) -{ - clear_fixmap(fixmap); -} -/* - * In ARMv8-A, A64 instructions have a fixed length of 32 bits and are always - * little-endian. - */ -int __kprobes aarch64_insn_read(void *addr, u32 *insnp) -{ - int ret; - __le32 val; - - ret = copy_from_kernel_nofault(&val, addr, AARCH64_INSN_SIZE); - if (!ret) - *insnp = le32_to_cpu(val); - - return ret; -} - -static int __kprobes __aarch64_insn_write(void *addr, __le32 insn) -{ - void *waddr = addr; - unsigned long flags = 0; - int ret; - - raw_spin_lock_irqsave(&patch_lock, flags); - waddr = patch_map(addr, FIX_TEXT_POKE0); - - ret = copy_to_kernel_nofault(waddr, &insn, AARCH64_INSN_SIZE); - - patch_unmap(FIX_TEXT_POKE0); - raw_spin_unlock_irqrestore(&patch_lock, flags); - - return ret; -} - -int __kprobes aarch64_insn_write(void *addr, u32 insn) -{ - return __aarch64_insn_write(addr, cpu_to_le32(insn)); -} - bool __kprobes aarch64_insn_uses_literal(u32 insn) { /* ldr/ldrsw (literal), prfm */ @@ -187,67 +105,6 @@ bool __kprobes aarch64_insn_is_branch(u32 insn) aarch64_insn_is_bcond(insn); } -int __kprobes aarch64_insn_patch_text_nosync(void *addr, u32 insn) -{ - u32 *tp = addr; - int ret; - - /* A64 instructions must be word aligned */ - if ((uintptr_t)tp & 0x3) - return -EINVAL; - - ret = aarch64_insn_write(tp, insn); - if (ret == 0) - __flush_icache_range((uintptr_t)tp, - (uintptr_t)tp + AARCH64_INSN_SIZE); - - return ret; -} - -struct aarch64_insn_patch { - void **text_addrs; - u32 *new_insns; - int insn_cnt; - atomic_t cpu_count; -}; - -static int __kprobes aarch64_insn_patch_text_cb(void *arg) -{ - int i, ret = 0; - struct aarch64_insn_patch *pp = arg; - - /* The first CPU becomes master */ - if (atomic_inc_return(&pp->cpu_count) == 1) { - for (i = 0; ret == 0 && i < pp->insn_cnt; i++) - ret = aarch64_insn_patch_text_nosync(pp->text_addrs[i], - pp->new_insns[i]); - /* Notify other processors with an additional increment. 
*/ - atomic_inc(&pp->cpu_count); - } else { - while (atomic_read(&pp->cpu_count) <= num_online_cpus()) - cpu_relax(); - isb(); - } - - return ret; -} - -int __kprobes aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt) -{ - struct aarch64_insn_patch patch = { - .text_addrs = addrs, - .new_insns = insns, - .insn_cnt = cnt, - .cpu_count = ATOMIC_INIT(0), - }; - - if (cnt <= 0) - return -EINVAL; - - return stop_machine_cpuslocked(aarch64_insn_patch_text_cb, &patch, - cpu_online_mask); -} - static int __kprobes aarch64_get_imm_shift_mask(enum aarch64_insn_imm_type type, u32 *maskp, int *shiftp) { diff --git a/arch/arm64/kernel/patching.c b/arch/arm64/kernel/patching.c new file mode 100644 index 000000000000..9d050e33901b --- /dev/null +++ b/arch/arm64/kernel/patching.c @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +static DEFINE_RAW_SPINLOCK(patch_lock); + +static bool is_exit_text(unsigned long addr) +{ + /* discarded with init text/data */ + return system_state < SYSTEM_RUNNING && + addr >= (unsigned long)__exittext_begin && + addr < (unsigned long)__exittext_end; +} + +static bool is_image_text(unsigned long addr) +{ + return core_kernel_text(addr) || is_exit_text(addr); +} + +static void __kprobes *patch_map(void *addr, int fixmap) +{ + unsigned long uintaddr = (uintptr_t) addr; + bool image = is_image_text(uintaddr); + struct page *page; + + if (image) + page = phys_to_page(__pa_symbol(addr)); + else if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) + page = vmalloc_to_page(addr); + else + return addr; + + BUG_ON(!page); + return (void *)set_fixmap_offset(fixmap, page_to_phys(page) + + (uintaddr & ~PAGE_MASK)); +} + +static void __kprobes patch_unmap(int fixmap) +{ + clear_fixmap(fixmap); +} +/* + * In ARMv8-A, A64 instructions have a fixed length of 32 bits and are always + * little-endian. + */ +int __kprobes aarch64_insn_read(void *addr, u32 *insnp) +{ + int ret; + __le32 val; + + ret = copy_from_kernel_nofault(&val, addr, AARCH64_INSN_SIZE); + if (!ret) + *insnp = le32_to_cpu(val); + + return ret; +} + +static int __kprobes __aarch64_insn_write(void *addr, __le32 insn) +{ + void *waddr = addr; + unsigned long flags = 0; + int ret; + + raw_spin_lock_irqsave(&patch_lock, flags); + waddr = patch_map(addr, FIX_TEXT_POKE0); + + ret = copy_to_kernel_nofault(waddr, &insn, AARCH64_INSN_SIZE); + + patch_unmap(FIX_TEXT_POKE0); + raw_spin_unlock_irqrestore(&patch_lock, flags); + + return ret; +} + +int __kprobes aarch64_insn_write(void *addr, u32 insn) +{ + return __aarch64_insn_write(addr, cpu_to_le32(insn)); +} + +int __kprobes aarch64_insn_patch_text_nosync(void *addr, u32 insn) +{ + u32 *tp = addr; + int ret; + + /* A64 instructions must be word aligned */ + if ((uintptr_t)tp & 0x3) + return -EINVAL; + + ret = aarch64_insn_write(tp, insn); + if (ret == 0) + __flush_icache_range((uintptr_t)tp, + (uintptr_t)tp + AARCH64_INSN_SIZE); + + return ret; +} + +struct aarch64_insn_patch { + void **text_addrs; + u32 *new_insns; + int insn_cnt; + atomic_t cpu_count; +}; + +static int __kprobes aarch64_insn_patch_text_cb(void *arg) +{ + int i, ret = 0; + struct aarch64_insn_patch *pp = arg; + + /* The first CPU becomes master */ + if (atomic_inc_return(&pp->cpu_count) == 1) { + for (i = 0; ret == 0 && i < pp->insn_cnt; i++) + ret = aarch64_insn_patch_text_nosync(pp->text_addrs[i], + pp->new_insns[i]); + /* Notify other processors with an additional increment. 
*/ + atomic_inc(&pp->cpu_count); + } else { + while (atomic_read(&pp->cpu_count) <= num_online_cpus()) + cpu_relax(); + isb(); + } + + return ret; +} + +int __kprobes aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt) +{ + struct aarch64_insn_patch patch = { + .text_addrs = addrs, + .new_insns = insns, + .insn_cnt = cnt, + .cpu_count = ATOMIC_INIT(0), + }; + + if (cnt <= 0) + return -EINVAL; + + return stop_machine_cpuslocked(aarch64_insn_patch_text_cb, &patch, + cpu_online_mask); +} -- cgit v1.2.3 From 633e5e938fea957577e6db33540a78debf0c5cbe Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Wed, 3 Mar 2021 18:05:30 +0100 Subject: arm64: Move aarch32 condition check functions The functions to check condition flags for aarch32 execution is only used to emulate aarch32 instructions. Move them from the instruction encoding/decoding code to the trap handling files. Signed-off-by: Julien Thierry Link: https://lore.kernel.org/r/20210303170536.1838032-3-jthierry@redhat.com [will: leave aarch32_opcode_cond_checks where it is] Signed-off-by: Will Deacon Signed-off-by: Will Deacon --- arch/arm64/include/asm/insn.h | 1 + arch/arm64/kernel/insn.c | 98 ------------------------------- arch/arm64/kernel/probes/simulate-insn.c | 1 + arch/arm64/kernel/traps.c | 99 +++++++++++++++++++++++++++++++- 4 files changed, 100 insertions(+), 99 deletions(-) diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index f08579e5119e..7adc4398fadb 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -502,6 +502,7 @@ u32 aarch32_insn_mcr_extract_crm(u32 insn); typedef bool (pstate_check_t)(unsigned long); extern pstate_check_t * const aarch32_opcode_cond_checks[16]; + #endif /* __ASSEMBLY__ */ #endif /* __ASM_INSN_H */ diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c index 952e7d6fe60e..6ff8826ae7ea 100644 --- a/arch/arm64/kernel/insn.c +++ b/arch/arm64/kernel/insn.c @@ -1289,104 +1289,6 @@ u32 aarch32_insn_mcr_extract_crm(u32 insn) return insn & CRM_MASK; } -static bool __kprobes __check_eq(unsigned long pstate) -{ - return (pstate & PSR_Z_BIT) != 0; -} - -static bool __kprobes __check_ne(unsigned long pstate) -{ - return (pstate & PSR_Z_BIT) == 0; -} - -static bool __kprobes __check_cs(unsigned long pstate) -{ - return (pstate & PSR_C_BIT) != 0; -} - -static bool __kprobes __check_cc(unsigned long pstate) -{ - return (pstate & PSR_C_BIT) == 0; -} - -static bool __kprobes __check_mi(unsigned long pstate) -{ - return (pstate & PSR_N_BIT) != 0; -} - -static bool __kprobes __check_pl(unsigned long pstate) -{ - return (pstate & PSR_N_BIT) == 0; -} - -static bool __kprobes __check_vs(unsigned long pstate) -{ - return (pstate & PSR_V_BIT) != 0; -} - -static bool __kprobes __check_vc(unsigned long pstate) -{ - return (pstate & PSR_V_BIT) == 0; -} - -static bool __kprobes __check_hi(unsigned long pstate) -{ - pstate &= ~(pstate >> 1); /* PSR_C_BIT &= ~PSR_Z_BIT */ - return (pstate & PSR_C_BIT) != 0; -} - -static bool __kprobes __check_ls(unsigned long pstate) -{ - pstate &= ~(pstate >> 1); /* PSR_C_BIT &= ~PSR_Z_BIT */ - return (pstate & PSR_C_BIT) == 0; -} - -static bool __kprobes __check_ge(unsigned long pstate) -{ - pstate ^= (pstate << 3); /* PSR_N_BIT ^= PSR_V_BIT */ - return (pstate & PSR_N_BIT) == 0; -} - -static bool __kprobes __check_lt(unsigned long pstate) -{ - pstate ^= (pstate << 3); /* PSR_N_BIT ^= PSR_V_BIT */ - return (pstate & PSR_N_BIT) != 0; -} - -static bool __kprobes __check_gt(unsigned long pstate) -{ - /*PSR_N_BIT ^= PSR_V_BIT 
*/ - unsigned long temp = pstate ^ (pstate << 3); - - temp |= (pstate << 1); /*PSR_N_BIT |= PSR_Z_BIT */ - return (temp & PSR_N_BIT) == 0; -} - -static bool __kprobes __check_le(unsigned long pstate) -{ - /*PSR_N_BIT ^= PSR_V_BIT */ - unsigned long temp = pstate ^ (pstate << 3); - - temp |= (pstate << 1); /*PSR_N_BIT |= PSR_Z_BIT */ - return (temp & PSR_N_BIT) != 0; -} - -static bool __kprobes __check_al(unsigned long pstate) -{ - return true; -} - -/* - * Note that the ARMv8 ARM calls condition code 0b1111 "nv", but states that - * it behaves identically to 0b1110 ("al"). - */ -pstate_check_t * const aarch32_opcode_cond_checks[16] = { - __check_eq, __check_ne, __check_cs, __check_cc, - __check_mi, __check_pl, __check_vs, __check_vc, - __check_hi, __check_ls, __check_ge, __check_lt, - __check_gt, __check_le, __check_al, __check_al -}; - static bool range_of_ones(u64 val) { /* Doesn't handle full ones or full zeroes */ diff --git a/arch/arm64/kernel/probes/simulate-insn.c b/arch/arm64/kernel/probes/simulate-insn.c index 25f67ec59635..22d0b3252476 100644 --- a/arch/arm64/kernel/probes/simulate-insn.c +++ b/arch/arm64/kernel/probes/simulate-insn.c @@ -10,6 +10,7 @@ #include #include +#include #include "simulate-insn.h" diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index a05d34f0e82a..9b683b2381cf 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -36,7 +36,6 @@ #include #include #include -#include #include #include #include @@ -45,6 +44,104 @@ #include #include +static bool __kprobes __check_eq(unsigned long pstate) +{ + return (pstate & PSR_Z_BIT) != 0; +} + +static bool __kprobes __check_ne(unsigned long pstate) +{ + return (pstate & PSR_Z_BIT) == 0; +} + +static bool __kprobes __check_cs(unsigned long pstate) +{ + return (pstate & PSR_C_BIT) != 0; +} + +static bool __kprobes __check_cc(unsigned long pstate) +{ + return (pstate & PSR_C_BIT) == 0; +} + +static bool __kprobes __check_mi(unsigned long pstate) +{ + return (pstate & PSR_N_BIT) != 0; +} + +static bool __kprobes __check_pl(unsigned long pstate) +{ + return (pstate & PSR_N_BIT) == 0; +} + +static bool __kprobes __check_vs(unsigned long pstate) +{ + return (pstate & PSR_V_BIT) != 0; +} + +static bool __kprobes __check_vc(unsigned long pstate) +{ + return (pstate & PSR_V_BIT) == 0; +} + +static bool __kprobes __check_hi(unsigned long pstate) +{ + pstate &= ~(pstate >> 1); /* PSR_C_BIT &= ~PSR_Z_BIT */ + return (pstate & PSR_C_BIT) != 0; +} + +static bool __kprobes __check_ls(unsigned long pstate) +{ + pstate &= ~(pstate >> 1); /* PSR_C_BIT &= ~PSR_Z_BIT */ + return (pstate & PSR_C_BIT) == 0; +} + +static bool __kprobes __check_ge(unsigned long pstate) +{ + pstate ^= (pstate << 3); /* PSR_N_BIT ^= PSR_V_BIT */ + return (pstate & PSR_N_BIT) == 0; +} + +static bool __kprobes __check_lt(unsigned long pstate) +{ + pstate ^= (pstate << 3); /* PSR_N_BIT ^= PSR_V_BIT */ + return (pstate & PSR_N_BIT) != 0; +} + +static bool __kprobes __check_gt(unsigned long pstate) +{ + /*PSR_N_BIT ^= PSR_V_BIT */ + unsigned long temp = pstate ^ (pstate << 3); + + temp |= (pstate << 1); /*PSR_N_BIT |= PSR_Z_BIT */ + return (temp & PSR_N_BIT) == 0; +} + +static bool __kprobes __check_le(unsigned long pstate) +{ + /*PSR_N_BIT ^= PSR_V_BIT */ + unsigned long temp = pstate ^ (pstate << 3); + + temp |= (pstate << 1); /*PSR_N_BIT |= PSR_Z_BIT */ + return (temp & PSR_N_BIT) != 0; +} + +static bool __kprobes __check_al(unsigned long pstate) +{ + return true; +} + +/* + * Note that the ARMv8 ARM calls condition code 0b1111 
"nv", but states that + * it behaves identically to 0b1110 ("al"). + */ +pstate_check_t * const aarch32_opcode_cond_checks[16] = { + __check_eq, __check_ne, __check_cs, __check_cc, + __check_mi, __check_pl, __check_vs, __check_vc, + __check_hi, __check_ls, __check_ge, __check_lt, + __check_gt, __check_le, __check_al, __check_al +}; + static const char *handler[] = { "Synchronous Abort", "IRQ", -- cgit v1.2.3 From 72fd723694b6f4f1d1f19f673fb93801d7d1a0e8 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Wed, 3 Mar 2021 18:05:32 +0100 Subject: arm64: Move instruction encoder/decoder under lib/ Aarch64 instruction set encoding and decoding logic can prove useful for some features/tools both part of the kernel and outside the kernel. Isolate the function dealing only with encoding/decoding instructions, with minimal dependency on kernel utilities in order to be able to reuse that code. Code was only moved, no code should have been added, removed nor modifier. Signed-off-by: Julien Thierry Link: https://lore.kernel.org/r/20210303170536.1838032-5-jthierry@redhat.com Signed-off-by: Will Deacon --- arch/arm64/kernel/Makefile | 2 +- arch/arm64/kernel/insn.c | 1458 -------------------------------------------- arch/arm64/lib/Makefile | 6 +- arch/arm64/lib/insn.c | 1458 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 1462 insertions(+), 1462 deletions(-) delete mode 100644 arch/arm64/kernel/insn.c create mode 100644 arch/arm64/lib/insn.c diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 3693156acc75..03e8311ce576 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -18,7 +18,7 @@ CFLAGS_syscall.o += -fno-stack-protector obj-y := debug-monitors.o entry.o irq.o fpsimd.o \ entry-common.o entry-fpsimd.o process.o ptrace.o \ setup.o signal.o sys.o stacktrace.o time.o traps.o \ - io.o vdso.o hyp-stub.o psci.o cpu_ops.o insn.o \ + io.o vdso.o hyp-stub.o psci.o cpu_ops.o \ return_address.o cpuinfo.o cpu_errata.o \ cpufeature.o alternative.o cacheinfo.o \ smp.o smp_spin_table.o topology.o smccc-call.o \ diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c deleted file mode 100644 index 6ff8826ae7ea..000000000000 --- a/arch/arm64/kernel/insn.c +++ /dev/null @@ -1,1458 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2013 Huawei Ltd. 
- * Author: Jiang Liu - * - * Copyright (C) 2014-2016 Zi Shen Lim - */ -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#define AARCH64_INSN_SF_BIT BIT(31) -#define AARCH64_INSN_N_BIT BIT(22) -#define AARCH64_INSN_LSL_12 BIT(22) - -static const int aarch64_insn_encoding_class[] = { - AARCH64_INSN_CLS_UNKNOWN, - AARCH64_INSN_CLS_UNKNOWN, - AARCH64_INSN_CLS_UNKNOWN, - AARCH64_INSN_CLS_UNKNOWN, - AARCH64_INSN_CLS_LDST, - AARCH64_INSN_CLS_DP_REG, - AARCH64_INSN_CLS_LDST, - AARCH64_INSN_CLS_DP_FPSIMD, - AARCH64_INSN_CLS_DP_IMM, - AARCH64_INSN_CLS_DP_IMM, - AARCH64_INSN_CLS_BR_SYS, - AARCH64_INSN_CLS_BR_SYS, - AARCH64_INSN_CLS_LDST, - AARCH64_INSN_CLS_DP_REG, - AARCH64_INSN_CLS_LDST, - AARCH64_INSN_CLS_DP_FPSIMD, -}; - -enum aarch64_insn_encoding_class __kprobes aarch64_get_insn_class(u32 insn) -{ - return aarch64_insn_encoding_class[(insn >> 25) & 0xf]; -} - -bool __kprobes aarch64_insn_is_steppable_hint(u32 insn) -{ - if (!aarch64_insn_is_hint(insn)) - return false; - - switch (insn & 0xFE0) { - case AARCH64_INSN_HINT_XPACLRI: - case AARCH64_INSN_HINT_PACIA_1716: - case AARCH64_INSN_HINT_PACIB_1716: - case AARCH64_INSN_HINT_PACIAZ: - case AARCH64_INSN_HINT_PACIASP: - case AARCH64_INSN_HINT_PACIBZ: - case AARCH64_INSN_HINT_PACIBSP: - case AARCH64_INSN_HINT_BTI: - case AARCH64_INSN_HINT_BTIC: - case AARCH64_INSN_HINT_BTIJ: - case AARCH64_INSN_HINT_BTIJC: - case AARCH64_INSN_HINT_NOP: - return true; - default: - return false; - } -} - -bool aarch64_insn_is_branch_imm(u32 insn) -{ - return (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn) || - aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn) || - aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) || - aarch64_insn_is_bcond(insn)); -} - -bool __kprobes aarch64_insn_uses_literal(u32 insn) -{ - /* ldr/ldrsw (literal), prfm */ - - return aarch64_insn_is_ldr_lit(insn) || - aarch64_insn_is_ldrsw_lit(insn) || - aarch64_insn_is_adr_adrp(insn) || - aarch64_insn_is_prfm_lit(insn); -} - -bool __kprobes aarch64_insn_is_branch(u32 insn) -{ - /* b, bl, cb*, tb*, ret*, b.cond, br*, blr* */ - - return aarch64_insn_is_b(insn) || - aarch64_insn_is_bl(insn) || - aarch64_insn_is_cbz(insn) || - aarch64_insn_is_cbnz(insn) || - aarch64_insn_is_tbz(insn) || - aarch64_insn_is_tbnz(insn) || - aarch64_insn_is_ret(insn) || - aarch64_insn_is_ret_auth(insn) || - aarch64_insn_is_br(insn) || - aarch64_insn_is_br_auth(insn) || - aarch64_insn_is_blr(insn) || - aarch64_insn_is_blr_auth(insn) || - aarch64_insn_is_bcond(insn); -} - -static int __kprobes aarch64_get_imm_shift_mask(enum aarch64_insn_imm_type type, - u32 *maskp, int *shiftp) -{ - u32 mask; - int shift; - - switch (type) { - case AARCH64_INSN_IMM_26: - mask = BIT(26) - 1; - shift = 0; - break; - case AARCH64_INSN_IMM_19: - mask = BIT(19) - 1; - shift = 5; - break; - case AARCH64_INSN_IMM_16: - mask = BIT(16) - 1; - shift = 5; - break; - case AARCH64_INSN_IMM_14: - mask = BIT(14) - 1; - shift = 5; - break; - case AARCH64_INSN_IMM_12: - mask = BIT(12) - 1; - shift = 10; - break; - case AARCH64_INSN_IMM_9: - mask = BIT(9) - 1; - shift = 12; - break; - case AARCH64_INSN_IMM_7: - mask = BIT(7) - 1; - shift = 15; - break; - case AARCH64_INSN_IMM_6: - case AARCH64_INSN_IMM_S: - mask = BIT(6) - 1; - shift = 10; - break; - case AARCH64_INSN_IMM_R: - mask = BIT(6) - 1; - shift = 16; - break; - case AARCH64_INSN_IMM_N: - mask = 1; - shift = 22; - break; - default: - return -EINVAL; - } - - *maskp = mask; - *shiftp = shift; - - return 0; -} - -#define ADR_IMM_HILOSPLIT 2 
-#define ADR_IMM_SIZE SZ_2M -#define ADR_IMM_LOMASK ((1 << ADR_IMM_HILOSPLIT) - 1) -#define ADR_IMM_HIMASK ((ADR_IMM_SIZE >> ADR_IMM_HILOSPLIT) - 1) -#define ADR_IMM_LOSHIFT 29 -#define ADR_IMM_HISHIFT 5 - -u64 aarch64_insn_decode_immediate(enum aarch64_insn_imm_type type, u32 insn) -{ - u32 immlo, immhi, mask; - int shift; - - switch (type) { - case AARCH64_INSN_IMM_ADR: - shift = 0; - immlo = (insn >> ADR_IMM_LOSHIFT) & ADR_IMM_LOMASK; - immhi = (insn >> ADR_IMM_HISHIFT) & ADR_IMM_HIMASK; - insn = (immhi << ADR_IMM_HILOSPLIT) | immlo; - mask = ADR_IMM_SIZE - 1; - break; - default: - if (aarch64_get_imm_shift_mask(type, &mask, &shift) < 0) { - pr_err("aarch64_insn_decode_immediate: unknown immediate encoding %d\n", - type); - return 0; - } - } - - return (insn >> shift) & mask; -} - -u32 __kprobes aarch64_insn_encode_immediate(enum aarch64_insn_imm_type type, - u32 insn, u64 imm) -{ - u32 immlo, immhi, mask; - int shift; - - if (insn == AARCH64_BREAK_FAULT) - return AARCH64_BREAK_FAULT; - - switch (type) { - case AARCH64_INSN_IMM_ADR: - shift = 0; - immlo = (imm & ADR_IMM_LOMASK) << ADR_IMM_LOSHIFT; - imm >>= ADR_IMM_HILOSPLIT; - immhi = (imm & ADR_IMM_HIMASK) << ADR_IMM_HISHIFT; - imm = immlo | immhi; - mask = ((ADR_IMM_LOMASK << ADR_IMM_LOSHIFT) | - (ADR_IMM_HIMASK << ADR_IMM_HISHIFT)); - break; - default: - if (aarch64_get_imm_shift_mask(type, &mask, &shift) < 0) { - pr_err("aarch64_insn_encode_immediate: unknown immediate encoding %d\n", - type); - return AARCH64_BREAK_FAULT; - } - } - - /* Update the immediate field. */ - insn &= ~(mask << shift); - insn |= (imm & mask) << shift; - - return insn; -} - -u32 aarch64_insn_decode_register(enum aarch64_insn_register_type type, - u32 insn) -{ - int shift; - - switch (type) { - case AARCH64_INSN_REGTYPE_RT: - case AARCH64_INSN_REGTYPE_RD: - shift = 0; - break; - case AARCH64_INSN_REGTYPE_RN: - shift = 5; - break; - case AARCH64_INSN_REGTYPE_RT2: - case AARCH64_INSN_REGTYPE_RA: - shift = 10; - break; - case AARCH64_INSN_REGTYPE_RM: - shift = 16; - break; - default: - pr_err("%s: unknown register type encoding %d\n", __func__, - type); - return 0; - } - - return (insn >> shift) & GENMASK(4, 0); -} - -static u32 aarch64_insn_encode_register(enum aarch64_insn_register_type type, - u32 insn, - enum aarch64_insn_register reg) -{ - int shift; - - if (insn == AARCH64_BREAK_FAULT) - return AARCH64_BREAK_FAULT; - - if (reg < AARCH64_INSN_REG_0 || reg > AARCH64_INSN_REG_SP) { - pr_err("%s: unknown register encoding %d\n", __func__, reg); - return AARCH64_BREAK_FAULT; - } - - switch (type) { - case AARCH64_INSN_REGTYPE_RT: - case AARCH64_INSN_REGTYPE_RD: - shift = 0; - break; - case AARCH64_INSN_REGTYPE_RN: - shift = 5; - break; - case AARCH64_INSN_REGTYPE_RT2: - case AARCH64_INSN_REGTYPE_RA: - shift = 10; - break; - case AARCH64_INSN_REGTYPE_RM: - case AARCH64_INSN_REGTYPE_RS: - shift = 16; - break; - default: - pr_err("%s: unknown register type encoding %d\n", __func__, - type); - return AARCH64_BREAK_FAULT; - } - - insn &= ~(GENMASK(4, 0) << shift); - insn |= reg << shift; - - return insn; -} - -static u32 aarch64_insn_encode_ldst_size(enum aarch64_insn_size_type type, - u32 insn) -{ - u32 size; - - switch (type) { - case AARCH64_INSN_SIZE_8: - size = 0; - break; - case AARCH64_INSN_SIZE_16: - size = 1; - break; - case AARCH64_INSN_SIZE_32: - size = 2; - break; - case AARCH64_INSN_SIZE_64: - size = 3; - break; - default: - pr_err("%s: unknown size encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - insn &= ~GENMASK(31, 30); - insn 
|= size << 30; - - return insn; -} - -static inline long branch_imm_common(unsigned long pc, unsigned long addr, - long range) -{ - long offset; - - if ((pc & 0x3) || (addr & 0x3)) { - pr_err("%s: A64 instructions must be word aligned\n", __func__); - return range; - } - - offset = ((long)addr - (long)pc); - - if (offset < -range || offset >= range) { - pr_err("%s: offset out of range\n", __func__); - return range; - } - - return offset; -} - -u32 __kprobes aarch64_insn_gen_branch_imm(unsigned long pc, unsigned long addr, - enum aarch64_insn_branch_type type) -{ - u32 insn; - long offset; - - /* - * B/BL support [-128M, 128M) offset - * ARM64 virtual address arrangement guarantees all kernel and module - * texts are within +/-128M. - */ - offset = branch_imm_common(pc, addr, SZ_128M); - if (offset >= SZ_128M) - return AARCH64_BREAK_FAULT; - - switch (type) { - case AARCH64_INSN_BRANCH_LINK: - insn = aarch64_insn_get_bl_value(); - break; - case AARCH64_INSN_BRANCH_NOLINK: - insn = aarch64_insn_get_b_value(); - break; - default: - pr_err("%s: unknown branch encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_26, insn, - offset >> 2); -} - -u32 aarch64_insn_gen_comp_branch_imm(unsigned long pc, unsigned long addr, - enum aarch64_insn_register reg, - enum aarch64_insn_variant variant, - enum aarch64_insn_branch_type type) -{ - u32 insn; - long offset; - - offset = branch_imm_common(pc, addr, SZ_1M); - if (offset >= SZ_1M) - return AARCH64_BREAK_FAULT; - - switch (type) { - case AARCH64_INSN_BRANCH_COMP_ZERO: - insn = aarch64_insn_get_cbz_value(); - break; - case AARCH64_INSN_BRANCH_COMP_NONZERO: - insn = aarch64_insn_get_cbnz_value(); - break; - default: - pr_err("%s: unknown branch encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - break; - case AARCH64_INSN_VARIANT_64BIT: - insn |= AARCH64_INSN_SF_BIT; - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, reg); - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_19, insn, - offset >> 2); -} - -u32 aarch64_insn_gen_cond_branch_imm(unsigned long pc, unsigned long addr, - enum aarch64_insn_condition cond) -{ - u32 insn; - long offset; - - offset = branch_imm_common(pc, addr, SZ_1M); - - insn = aarch64_insn_get_bcond_value(); - - if (cond < AARCH64_INSN_COND_EQ || cond > AARCH64_INSN_COND_AL) { - pr_err("%s: unknown condition encoding %d\n", __func__, cond); - return AARCH64_BREAK_FAULT; - } - insn |= cond; - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_19, insn, - offset >> 2); -} - -u32 __kprobes aarch64_insn_gen_hint(enum aarch64_insn_hint_cr_op op) -{ - return aarch64_insn_get_hint_value() | op; -} - -u32 __kprobes aarch64_insn_gen_nop(void) -{ - return aarch64_insn_gen_hint(AARCH64_INSN_HINT_NOP); -} - -u32 aarch64_insn_gen_branch_reg(enum aarch64_insn_register reg, - enum aarch64_insn_branch_type type) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_BRANCH_NOLINK: - insn = aarch64_insn_get_br_value(); - break; - case AARCH64_INSN_BRANCH_LINK: - insn = aarch64_insn_get_blr_value(); - break; - case AARCH64_INSN_BRANCH_RETURN: - insn = aarch64_insn_get_ret_value(); - break; - default: - pr_err("%s: unknown branch encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - return 
aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, reg); -} - -u32 aarch64_insn_gen_load_store_reg(enum aarch64_insn_register reg, - enum aarch64_insn_register base, - enum aarch64_insn_register offset, - enum aarch64_insn_size_type size, - enum aarch64_insn_ldst_type type) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_LDST_LOAD_REG_OFFSET: - insn = aarch64_insn_get_ldr_reg_value(); - break; - case AARCH64_INSN_LDST_STORE_REG_OFFSET: - insn = aarch64_insn_get_str_reg_value(); - break; - default: - pr_err("%s: unknown load/store encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_ldst_size(size, insn); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, reg); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, - base); - - return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, - offset); -} - -u32 aarch64_insn_gen_load_store_pair(enum aarch64_insn_register reg1, - enum aarch64_insn_register reg2, - enum aarch64_insn_register base, - int offset, - enum aarch64_insn_variant variant, - enum aarch64_insn_ldst_type type) -{ - u32 insn; - int shift; - - switch (type) { - case AARCH64_INSN_LDST_LOAD_PAIR_PRE_INDEX: - insn = aarch64_insn_get_ldp_pre_value(); - break; - case AARCH64_INSN_LDST_STORE_PAIR_PRE_INDEX: - insn = aarch64_insn_get_stp_pre_value(); - break; - case AARCH64_INSN_LDST_LOAD_PAIR_POST_INDEX: - insn = aarch64_insn_get_ldp_post_value(); - break; - case AARCH64_INSN_LDST_STORE_PAIR_POST_INDEX: - insn = aarch64_insn_get_stp_post_value(); - break; - default: - pr_err("%s: unknown load/store encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - if ((offset & 0x3) || (offset < -256) || (offset > 252)) { - pr_err("%s: offset must be multiples of 4 in the range of [-256, 252] %d\n", - __func__, offset); - return AARCH64_BREAK_FAULT; - } - shift = 2; - break; - case AARCH64_INSN_VARIANT_64BIT: - if ((offset & 0x7) || (offset < -512) || (offset > 504)) { - pr_err("%s: offset must be multiples of 8 in the range of [-512, 504] %d\n", - __func__, offset); - return AARCH64_BREAK_FAULT; - } - shift = 3; - insn |= AARCH64_INSN_SF_BIT; - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, - reg1); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT2, insn, - reg2); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, - base); - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_7, insn, - offset >> shift); -} - -u32 aarch64_insn_gen_load_store_ex(enum aarch64_insn_register reg, - enum aarch64_insn_register base, - enum aarch64_insn_register state, - enum aarch64_insn_size_type size, - enum aarch64_insn_ldst_type type) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_LDST_LOAD_EX: - insn = aarch64_insn_get_load_ex_value(); - break; - case AARCH64_INSN_LDST_STORE_EX: - insn = aarch64_insn_get_store_ex_value(); - break; - default: - pr_err("%s: unknown load/store exclusive encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_ldst_size(size, insn); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, - reg); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, - base); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT2, insn, - AARCH64_INSN_REG_ZR); 
- - return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RS, insn, - state); -} - -u32 aarch64_insn_gen_ldadd(enum aarch64_insn_register result, - enum aarch64_insn_register address, - enum aarch64_insn_register value, - enum aarch64_insn_size_type size) -{ - u32 insn = aarch64_insn_get_ldadd_value(); - - switch (size) { - case AARCH64_INSN_SIZE_32: - case AARCH64_INSN_SIZE_64: - break; - default: - pr_err("%s: unimplemented size encoding %d\n", __func__, size); - return AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_ldst_size(size, insn); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, - result); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, - address); - - return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RS, insn, - value); -} - -u32 aarch64_insn_gen_stadd(enum aarch64_insn_register address, - enum aarch64_insn_register value, - enum aarch64_insn_size_type size) -{ - /* - * STADD is simply encoded as an alias for LDADD with XZR as - * the destination register. - */ - return aarch64_insn_gen_ldadd(AARCH64_INSN_REG_ZR, address, - value, size); -} - -static u32 aarch64_insn_encode_prfm_imm(enum aarch64_insn_prfm_type type, - enum aarch64_insn_prfm_target target, - enum aarch64_insn_prfm_policy policy, - u32 insn) -{ - u32 imm_type = 0, imm_target = 0, imm_policy = 0; - - switch (type) { - case AARCH64_INSN_PRFM_TYPE_PLD: - break; - case AARCH64_INSN_PRFM_TYPE_PLI: - imm_type = BIT(0); - break; - case AARCH64_INSN_PRFM_TYPE_PST: - imm_type = BIT(1); - break; - default: - pr_err("%s: unknown prfm type encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (target) { - case AARCH64_INSN_PRFM_TARGET_L1: - break; - case AARCH64_INSN_PRFM_TARGET_L2: - imm_target = BIT(0); - break; - case AARCH64_INSN_PRFM_TARGET_L3: - imm_target = BIT(1); - break; - default: - pr_err("%s: unknown prfm target encoding %d\n", __func__, target); - return AARCH64_BREAK_FAULT; - } - - switch (policy) { - case AARCH64_INSN_PRFM_POLICY_KEEP: - break; - case AARCH64_INSN_PRFM_POLICY_STRM: - imm_policy = BIT(0); - break; - default: - pr_err("%s: unknown prfm policy encoding %d\n", __func__, policy); - return AARCH64_BREAK_FAULT; - } - - /* In this case, imm5 is encoded into Rt field. 
*/ - insn &= ~GENMASK(4, 0); - insn |= imm_policy | (imm_target << 1) | (imm_type << 3); - - return insn; -} - -u32 aarch64_insn_gen_prefetch(enum aarch64_insn_register base, - enum aarch64_insn_prfm_type type, - enum aarch64_insn_prfm_target target, - enum aarch64_insn_prfm_policy policy) -{ - u32 insn = aarch64_insn_get_prfm_value(); - - insn = aarch64_insn_encode_ldst_size(AARCH64_INSN_SIZE_64, insn); - - insn = aarch64_insn_encode_prfm_imm(type, target, policy, insn); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, - base); - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_12, insn, 0); -} - -u32 aarch64_insn_gen_add_sub_imm(enum aarch64_insn_register dst, - enum aarch64_insn_register src, - int imm, enum aarch64_insn_variant variant, - enum aarch64_insn_adsb_type type) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_ADSB_ADD: - insn = aarch64_insn_get_add_imm_value(); - break; - case AARCH64_INSN_ADSB_SUB: - insn = aarch64_insn_get_sub_imm_value(); - break; - case AARCH64_INSN_ADSB_ADD_SETFLAGS: - insn = aarch64_insn_get_adds_imm_value(); - break; - case AARCH64_INSN_ADSB_SUB_SETFLAGS: - insn = aarch64_insn_get_subs_imm_value(); - break; - default: - pr_err("%s: unknown add/sub encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - break; - case AARCH64_INSN_VARIANT_64BIT: - insn |= AARCH64_INSN_SF_BIT; - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - /* We can't encode more than a 24bit value (12bit + 12bit shift) */ - if (imm & ~(BIT(24) - 1)) - goto out; - - /* If we have something in the top 12 bits... */ - if (imm & ~(SZ_4K - 1)) { - /* ... and in the low 12 bits -> error */ - if (imm & (SZ_4K - 1)) - goto out; - - imm >>= 12; - insn |= AARCH64_INSN_LSL_12; - } - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_12, insn, imm); - -out: - pr_err("%s: invalid immediate encoding %d\n", __func__, imm); - return AARCH64_BREAK_FAULT; -} - -u32 aarch64_insn_gen_bitfield(enum aarch64_insn_register dst, - enum aarch64_insn_register src, - int immr, int imms, - enum aarch64_insn_variant variant, - enum aarch64_insn_bitfield_type type) -{ - u32 insn; - u32 mask; - - switch (type) { - case AARCH64_INSN_BITFIELD_MOVE: - insn = aarch64_insn_get_bfm_value(); - break; - case AARCH64_INSN_BITFIELD_MOVE_UNSIGNED: - insn = aarch64_insn_get_ubfm_value(); - break; - case AARCH64_INSN_BITFIELD_MOVE_SIGNED: - insn = aarch64_insn_get_sbfm_value(); - break; - default: - pr_err("%s: unknown bitfield encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - mask = GENMASK(4, 0); - break; - case AARCH64_INSN_VARIANT_64BIT: - insn |= AARCH64_INSN_SF_BIT | AARCH64_INSN_N_BIT; - mask = GENMASK(5, 0); - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - if (immr & ~mask) { - pr_err("%s: invalid immr encoding %d\n", __func__, immr); - return AARCH64_BREAK_FAULT; - } - if (imms & ~mask) { - pr_err("%s: invalid imms encoding %d\n", __func__, imms); - return AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); - - 
insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_R, insn, immr); - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_S, insn, imms); -} - -u32 aarch64_insn_gen_movewide(enum aarch64_insn_register dst, - int imm, int shift, - enum aarch64_insn_variant variant, - enum aarch64_insn_movewide_type type) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_MOVEWIDE_ZERO: - insn = aarch64_insn_get_movz_value(); - break; - case AARCH64_INSN_MOVEWIDE_KEEP: - insn = aarch64_insn_get_movk_value(); - break; - case AARCH64_INSN_MOVEWIDE_INVERSE: - insn = aarch64_insn_get_movn_value(); - break; - default: - pr_err("%s: unknown movewide encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - if (imm & ~(SZ_64K - 1)) { - pr_err("%s: invalid immediate encoding %d\n", __func__, imm); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - if (shift != 0 && shift != 16) { - pr_err("%s: invalid shift encoding %d\n", __func__, - shift); - return AARCH64_BREAK_FAULT; - } - break; - case AARCH64_INSN_VARIANT_64BIT: - insn |= AARCH64_INSN_SF_BIT; - if (shift != 0 && shift != 16 && shift != 32 && shift != 48) { - pr_err("%s: invalid shift encoding %d\n", __func__, - shift); - return AARCH64_BREAK_FAULT; - } - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - insn |= (shift >> 4) << 21; - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_16, insn, imm); -} - -u32 aarch64_insn_gen_add_sub_shifted_reg(enum aarch64_insn_register dst, - enum aarch64_insn_register src, - enum aarch64_insn_register reg, - int shift, - enum aarch64_insn_variant variant, - enum aarch64_insn_adsb_type type) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_ADSB_ADD: - insn = aarch64_insn_get_add_value(); - break; - case AARCH64_INSN_ADSB_SUB: - insn = aarch64_insn_get_sub_value(); - break; - case AARCH64_INSN_ADSB_ADD_SETFLAGS: - insn = aarch64_insn_get_adds_value(); - break; - case AARCH64_INSN_ADSB_SUB_SETFLAGS: - insn = aarch64_insn_get_subs_value(); - break; - default: - pr_err("%s: unknown add/sub encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - if (shift & ~(SZ_32 - 1)) { - pr_err("%s: invalid shift encoding %d\n", __func__, - shift); - return AARCH64_BREAK_FAULT; - } - break; - case AARCH64_INSN_VARIANT_64BIT: - insn |= AARCH64_INSN_SF_BIT; - if (shift & ~(SZ_64 - 1)) { - pr_err("%s: invalid shift encoding %d\n", __func__, - shift); - return AARCH64_BREAK_FAULT; - } - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, reg); - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_6, insn, shift); -} - -u32 aarch64_insn_gen_data1(enum aarch64_insn_register dst, - enum aarch64_insn_register src, - enum aarch64_insn_variant variant, - enum aarch64_insn_data1_type type) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_DATA1_REVERSE_16: - insn = aarch64_insn_get_rev16_value(); - break; - case AARCH64_INSN_DATA1_REVERSE_32: - insn = aarch64_insn_get_rev32_value(); - break; - case AARCH64_INSN_DATA1_REVERSE_64: - if (variant != 
AARCH64_INSN_VARIANT_64BIT) { - pr_err("%s: invalid variant for reverse64 %d\n", - __func__, variant); - return AARCH64_BREAK_FAULT; - } - insn = aarch64_insn_get_rev64_value(); - break; - default: - pr_err("%s: unknown data1 encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - break; - case AARCH64_INSN_VARIANT_64BIT: - insn |= AARCH64_INSN_SF_BIT; - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); - - return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); -} - -u32 aarch64_insn_gen_data2(enum aarch64_insn_register dst, - enum aarch64_insn_register src, - enum aarch64_insn_register reg, - enum aarch64_insn_variant variant, - enum aarch64_insn_data2_type type) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_DATA2_UDIV: - insn = aarch64_insn_get_udiv_value(); - break; - case AARCH64_INSN_DATA2_SDIV: - insn = aarch64_insn_get_sdiv_value(); - break; - case AARCH64_INSN_DATA2_LSLV: - insn = aarch64_insn_get_lslv_value(); - break; - case AARCH64_INSN_DATA2_LSRV: - insn = aarch64_insn_get_lsrv_value(); - break; - case AARCH64_INSN_DATA2_ASRV: - insn = aarch64_insn_get_asrv_value(); - break; - case AARCH64_INSN_DATA2_RORV: - insn = aarch64_insn_get_rorv_value(); - break; - default: - pr_err("%s: unknown data2 encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - break; - case AARCH64_INSN_VARIANT_64BIT: - insn |= AARCH64_INSN_SF_BIT; - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); - - return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, reg); -} - -u32 aarch64_insn_gen_data3(enum aarch64_insn_register dst, - enum aarch64_insn_register src, - enum aarch64_insn_register reg1, - enum aarch64_insn_register reg2, - enum aarch64_insn_variant variant, - enum aarch64_insn_data3_type type) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_DATA3_MADD: - insn = aarch64_insn_get_madd_value(); - break; - case AARCH64_INSN_DATA3_MSUB: - insn = aarch64_insn_get_msub_value(); - break; - default: - pr_err("%s: unknown data3 encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - break; - case AARCH64_INSN_VARIANT_64BIT: - insn |= AARCH64_INSN_SF_BIT; - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RA, insn, src); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, - reg1); - - return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, - reg2); -} - -u32 aarch64_insn_gen_logical_shifted_reg(enum aarch64_insn_register dst, - enum aarch64_insn_register src, - enum aarch64_insn_register reg, - int shift, - enum aarch64_insn_variant variant, - enum aarch64_insn_logic_type type) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_LOGIC_AND: - insn = aarch64_insn_get_and_value(); - break; - case AARCH64_INSN_LOGIC_BIC: - insn = aarch64_insn_get_bic_value(); - 
break; - case AARCH64_INSN_LOGIC_ORR: - insn = aarch64_insn_get_orr_value(); - break; - case AARCH64_INSN_LOGIC_ORN: - insn = aarch64_insn_get_orn_value(); - break; - case AARCH64_INSN_LOGIC_EOR: - insn = aarch64_insn_get_eor_value(); - break; - case AARCH64_INSN_LOGIC_EON: - insn = aarch64_insn_get_eon_value(); - break; - case AARCH64_INSN_LOGIC_AND_SETFLAGS: - insn = aarch64_insn_get_ands_value(); - break; - case AARCH64_INSN_LOGIC_BIC_SETFLAGS: - insn = aarch64_insn_get_bics_value(); - break; - default: - pr_err("%s: unknown logical encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - if (shift & ~(SZ_32 - 1)) { - pr_err("%s: invalid shift encoding %d\n", __func__, - shift); - return AARCH64_BREAK_FAULT; - } - break; - case AARCH64_INSN_VARIANT_64BIT: - insn |= AARCH64_INSN_SF_BIT; - if (shift & ~(SZ_64 - 1)) { - pr_err("%s: invalid shift encoding %d\n", __func__, - shift); - return AARCH64_BREAK_FAULT; - } - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, reg); - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_6, insn, shift); -} - -/* - * MOV (register) is architecturally an alias of ORR (shifted register) where - * MOV <*d>, <*m> is equivalent to ORR <*d>, <*ZR>, <*m> - */ -u32 aarch64_insn_gen_move_reg(enum aarch64_insn_register dst, - enum aarch64_insn_register src, - enum aarch64_insn_variant variant) -{ - return aarch64_insn_gen_logical_shifted_reg(dst, AARCH64_INSN_REG_ZR, - src, 0, variant, - AARCH64_INSN_LOGIC_ORR); -} - -u32 aarch64_insn_gen_adr(unsigned long pc, unsigned long addr, - enum aarch64_insn_register reg, - enum aarch64_insn_adr_type type) -{ - u32 insn; - s32 offset; - - switch (type) { - case AARCH64_INSN_ADR_TYPE_ADR: - insn = aarch64_insn_get_adr_value(); - offset = addr - pc; - break; - case AARCH64_INSN_ADR_TYPE_ADRP: - insn = aarch64_insn_get_adrp_value(); - offset = (addr - ALIGN_DOWN(pc, SZ_4K)) >> 12; - break; - default: - pr_err("%s: unknown adr encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - if (offset < -SZ_1M || offset >= SZ_1M) - return AARCH64_BREAK_FAULT; - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, reg); - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_ADR, insn, offset); -} - -/* - * Decode the imm field of a branch, and return the byte offset as a - * signed value (so it can be used when computing a new branch - * target). - */ -s32 aarch64_get_branch_offset(u32 insn) -{ - s32 imm; - - if (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn)) { - imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_26, insn); - return (imm << 6) >> 4; - } - - if (aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) || - aarch64_insn_is_bcond(insn)) { - imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_19, insn); - return (imm << 13) >> 11; - } - - if (aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn)) { - imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_14, insn); - return (imm << 18) >> 16; - } - - /* Unhandled instruction */ - BUG(); -} - -/* - * Encode the displacement of a branch in the imm field and return the - * updated instruction. 
- */ -u32 aarch64_set_branch_offset(u32 insn, s32 offset) -{ - if (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn)) - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_26, insn, - offset >> 2); - - if (aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) || - aarch64_insn_is_bcond(insn)) - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_19, insn, - offset >> 2); - - if (aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn)) - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_14, insn, - offset >> 2); - - /* Unhandled instruction */ - BUG(); -} - -s32 aarch64_insn_adrp_get_offset(u32 insn) -{ - BUG_ON(!aarch64_insn_is_adrp(insn)); - return aarch64_insn_decode_immediate(AARCH64_INSN_IMM_ADR, insn) << 12; -} - -u32 aarch64_insn_adrp_set_offset(u32 insn, s32 offset) -{ - BUG_ON(!aarch64_insn_is_adrp(insn)); - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_ADR, insn, - offset >> 12); -} - -/* - * Extract the Op/CR data from a msr/mrs instruction. - */ -u32 aarch64_insn_extract_system_reg(u32 insn) -{ - return (insn & 0x1FFFE0) >> 5; -} - -bool aarch32_insn_is_wide(u32 insn) -{ - return insn >= 0xe800; -} - -/* - * Macros/defines for extracting register numbers from instruction. - */ -u32 aarch32_insn_extract_reg_num(u32 insn, int offset) -{ - return (insn & (0xf << offset)) >> offset; -} - -#define OPC2_MASK 0x7 -#define OPC2_OFFSET 5 -u32 aarch32_insn_mcr_extract_opc2(u32 insn) -{ - return (insn & (OPC2_MASK << OPC2_OFFSET)) >> OPC2_OFFSET; -} - -#define CRM_MASK 0xf -u32 aarch32_insn_mcr_extract_crm(u32 insn) -{ - return insn & CRM_MASK; -} - -static bool range_of_ones(u64 val) -{ - /* Doesn't handle full ones or full zeroes */ - u64 sval = val >> __ffs64(val); - - /* One of Sean Eron Anderson's bithack tricks */ - return ((sval + 1) & (sval)) == 0; -} - -static u32 aarch64_encode_immediate(u64 imm, - enum aarch64_insn_variant variant, - u32 insn) -{ - unsigned int immr, imms, n, ones, ror, esz, tmp; - u64 mask; - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - esz = 32; - break; - case AARCH64_INSN_VARIANT_64BIT: - insn |= AARCH64_INSN_SF_BIT; - esz = 64; - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - mask = GENMASK(esz - 1, 0); - - /* Can't encode full zeroes, full ones, or value wider than the mask */ - if (!imm || imm == mask || imm & ~mask) - return AARCH64_BREAK_FAULT; - - /* - * Inverse of Replicate(). Try to spot a repeating pattern - * with a pow2 stride. - */ - for (tmp = esz / 2; tmp >= 2; tmp /= 2) { - u64 emask = BIT(tmp) - 1; - - if ((imm & emask) != ((imm >> tmp) & emask)) - break; - - esz = tmp; - mask = emask; - } - - /* N is only set if we're encoding a 64bit value */ - n = esz == 64; - - /* Trim imm to the element size */ - imm &= mask; - - /* That's how many ones we need to encode */ - ones = hweight64(imm); - - /* - * imms is set to (ones - 1), prefixed with a string of ones - * and a zero if they fit. Cap it to 6 bits. - */ - imms = ones - 1; - imms |= 0xf << ffs(esz); - imms &= BIT(6) - 1; - - /* Compute the rotation */ - if (range_of_ones(imm)) { - /* - * Pattern: 0..01..10..0 - * - * Compute how many rotate we need to align it right - */ - ror = __ffs64(imm); - } else { - /* - * Pattern: 0..01..10..01..1 - * - * Fill the unused top bits with ones, and check if - * the result is a valid immediate (all ones with a - * contiguous ranges of zeroes). 
- */ - imm |= ~mask; - if (!range_of_ones(~imm)) - return AARCH64_BREAK_FAULT; - - /* - * Compute the rotation to get a continuous set of - * ones, with the first bit set at position 0 - */ - ror = fls(~imm); - } - - /* - * immr is the number of bits we need to rotate back to the - * original set of ones. Note that this is relative to the - * element size... - */ - immr = (esz - ror) % esz; - - insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_N, insn, n); - insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_R, insn, immr); - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_S, insn, imms); -} - -u32 aarch64_insn_gen_logical_immediate(enum aarch64_insn_logic_type type, - enum aarch64_insn_variant variant, - enum aarch64_insn_register Rn, - enum aarch64_insn_register Rd, - u64 imm) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_LOGIC_AND: - insn = aarch64_insn_get_and_imm_value(); - break; - case AARCH64_INSN_LOGIC_ORR: - insn = aarch64_insn_get_orr_imm_value(); - break; - case AARCH64_INSN_LOGIC_EOR: - insn = aarch64_insn_get_eor_imm_value(); - break; - case AARCH64_INSN_LOGIC_AND_SETFLAGS: - insn = aarch64_insn_get_ands_imm_value(); - break; - default: - pr_err("%s: unknown logical encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, Rd); - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, Rn); - return aarch64_encode_immediate(imm, variant, insn); -} - -u32 aarch64_insn_gen_extr(enum aarch64_insn_variant variant, - enum aarch64_insn_register Rm, - enum aarch64_insn_register Rn, - enum aarch64_insn_register Rd, - u8 lsb) -{ - u32 insn; - - insn = aarch64_insn_get_extr_value(); - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - if (lsb > 31) - return AARCH64_BREAK_FAULT; - break; - case AARCH64_INSN_VARIANT_64BIT: - if (lsb > 63) - return AARCH64_BREAK_FAULT; - insn |= AARCH64_INSN_SF_BIT; - insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_N, insn, 1); - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_S, insn, lsb); - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, Rd); - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, Rn); - return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, Rm); -} diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index d31e1169d9b8..9cd83908717d 100644 --- a/arch/arm64/lib/Makefile +++ b/arch/arm64/lib/Makefile @@ -1,9 +1,9 @@ # SPDX-License-Identifier: GPL-2.0 lib-y := clear_user.o delay.o copy_from_user.o \ copy_to_user.o copy_in_user.o copy_page.o \ - clear_page.o csum.o memchr.o memcpy.o memmove.o \ - memset.o memcmp.o strcmp.o strncmp.o strlen.o \ - strnlen.o strchr.o strrchr.o tishift.o + clear_page.o csum.o insn.o memchr.o memcpy.o \ + memmove.o memset.o memcmp.o strcmp.o strncmp.o \ + strlen.o strnlen.o strchr.o strrchr.o tishift.o ifeq ($(CONFIG_KERNEL_MODE_NEON), y) obj-$(CONFIG_XOR_BLOCKS) += xor-neon.o diff --git a/arch/arm64/lib/insn.c b/arch/arm64/lib/insn.c new file mode 100644 index 000000000000..6ff8826ae7ea --- /dev/null +++ b/arch/arm64/lib/insn.c @@ -0,0 +1,1458 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2013 Huawei Ltd. 
+ * Author: Jiang Liu + * + * Copyright (C) 2014-2016 Zi Shen Lim + */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define AARCH64_INSN_SF_BIT BIT(31) +#define AARCH64_INSN_N_BIT BIT(22) +#define AARCH64_INSN_LSL_12 BIT(22) + +static const int aarch64_insn_encoding_class[] = { + AARCH64_INSN_CLS_UNKNOWN, + AARCH64_INSN_CLS_UNKNOWN, + AARCH64_INSN_CLS_UNKNOWN, + AARCH64_INSN_CLS_UNKNOWN, + AARCH64_INSN_CLS_LDST, + AARCH64_INSN_CLS_DP_REG, + AARCH64_INSN_CLS_LDST, + AARCH64_INSN_CLS_DP_FPSIMD, + AARCH64_INSN_CLS_DP_IMM, + AARCH64_INSN_CLS_DP_IMM, + AARCH64_INSN_CLS_BR_SYS, + AARCH64_INSN_CLS_BR_SYS, + AARCH64_INSN_CLS_LDST, + AARCH64_INSN_CLS_DP_REG, + AARCH64_INSN_CLS_LDST, + AARCH64_INSN_CLS_DP_FPSIMD, +}; + +enum aarch64_insn_encoding_class __kprobes aarch64_get_insn_class(u32 insn) +{ + return aarch64_insn_encoding_class[(insn >> 25) & 0xf]; +} + +bool __kprobes aarch64_insn_is_steppable_hint(u32 insn) +{ + if (!aarch64_insn_is_hint(insn)) + return false; + + switch (insn & 0xFE0) { + case AARCH64_INSN_HINT_XPACLRI: + case AARCH64_INSN_HINT_PACIA_1716: + case AARCH64_INSN_HINT_PACIB_1716: + case AARCH64_INSN_HINT_PACIAZ: + case AARCH64_INSN_HINT_PACIASP: + case AARCH64_INSN_HINT_PACIBZ: + case AARCH64_INSN_HINT_PACIBSP: + case AARCH64_INSN_HINT_BTI: + case AARCH64_INSN_HINT_BTIC: + case AARCH64_INSN_HINT_BTIJ: + case AARCH64_INSN_HINT_BTIJC: + case AARCH64_INSN_HINT_NOP: + return true; + default: + return false; + } +} + +bool aarch64_insn_is_branch_imm(u32 insn) +{ + return (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn) || + aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn) || + aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) || + aarch64_insn_is_bcond(insn)); +} + +bool __kprobes aarch64_insn_uses_literal(u32 insn) +{ + /* ldr/ldrsw (literal), prfm */ + + return aarch64_insn_is_ldr_lit(insn) || + aarch64_insn_is_ldrsw_lit(insn) || + aarch64_insn_is_adr_adrp(insn) || + aarch64_insn_is_prfm_lit(insn); +} + +bool __kprobes aarch64_insn_is_branch(u32 insn) +{ + /* b, bl, cb*, tb*, ret*, b.cond, br*, blr* */ + + return aarch64_insn_is_b(insn) || + aarch64_insn_is_bl(insn) || + aarch64_insn_is_cbz(insn) || + aarch64_insn_is_cbnz(insn) || + aarch64_insn_is_tbz(insn) || + aarch64_insn_is_tbnz(insn) || + aarch64_insn_is_ret(insn) || + aarch64_insn_is_ret_auth(insn) || + aarch64_insn_is_br(insn) || + aarch64_insn_is_br_auth(insn) || + aarch64_insn_is_blr(insn) || + aarch64_insn_is_blr_auth(insn) || + aarch64_insn_is_bcond(insn); +} + +static int __kprobes aarch64_get_imm_shift_mask(enum aarch64_insn_imm_type type, + u32 *maskp, int *shiftp) +{ + u32 mask; + int shift; + + switch (type) { + case AARCH64_INSN_IMM_26: + mask = BIT(26) - 1; + shift = 0; + break; + case AARCH64_INSN_IMM_19: + mask = BIT(19) - 1; + shift = 5; + break; + case AARCH64_INSN_IMM_16: + mask = BIT(16) - 1; + shift = 5; + break; + case AARCH64_INSN_IMM_14: + mask = BIT(14) - 1; + shift = 5; + break; + case AARCH64_INSN_IMM_12: + mask = BIT(12) - 1; + shift = 10; + break; + case AARCH64_INSN_IMM_9: + mask = BIT(9) - 1; + shift = 12; + break; + case AARCH64_INSN_IMM_7: + mask = BIT(7) - 1; + shift = 15; + break; + case AARCH64_INSN_IMM_6: + case AARCH64_INSN_IMM_S: + mask = BIT(6) - 1; + shift = 10; + break; + case AARCH64_INSN_IMM_R: + mask = BIT(6) - 1; + shift = 16; + break; + case AARCH64_INSN_IMM_N: + mask = 1; + shift = 22; + break; + default: + return -EINVAL; + } + + *maskp = mask; + *shiftp = shift; + + return 0; +} + +#define ADR_IMM_HILOSPLIT 2 
+#define ADR_IMM_SIZE SZ_2M +#define ADR_IMM_LOMASK ((1 << ADR_IMM_HILOSPLIT) - 1) +#define ADR_IMM_HIMASK ((ADR_IMM_SIZE >> ADR_IMM_HILOSPLIT) - 1) +#define ADR_IMM_LOSHIFT 29 +#define ADR_IMM_HISHIFT 5 + +u64 aarch64_insn_decode_immediate(enum aarch64_insn_imm_type type, u32 insn) +{ + u32 immlo, immhi, mask; + int shift; + + switch (type) { + case AARCH64_INSN_IMM_ADR: + shift = 0; + immlo = (insn >> ADR_IMM_LOSHIFT) & ADR_IMM_LOMASK; + immhi = (insn >> ADR_IMM_HISHIFT) & ADR_IMM_HIMASK; + insn = (immhi << ADR_IMM_HILOSPLIT) | immlo; + mask = ADR_IMM_SIZE - 1; + break; + default: + if (aarch64_get_imm_shift_mask(type, &mask, &shift) < 0) { + pr_err("aarch64_insn_decode_immediate: unknown immediate encoding %d\n", + type); + return 0; + } + } + + return (insn >> shift) & mask; +} + +u32 __kprobes aarch64_insn_encode_immediate(enum aarch64_insn_imm_type type, + u32 insn, u64 imm) +{ + u32 immlo, immhi, mask; + int shift; + + if (insn == AARCH64_BREAK_FAULT) + return AARCH64_BREAK_FAULT; + + switch (type) { + case AARCH64_INSN_IMM_ADR: + shift = 0; + immlo = (imm & ADR_IMM_LOMASK) << ADR_IMM_LOSHIFT; + imm >>= ADR_IMM_HILOSPLIT; + immhi = (imm & ADR_IMM_HIMASK) << ADR_IMM_HISHIFT; + imm = immlo | immhi; + mask = ((ADR_IMM_LOMASK << ADR_IMM_LOSHIFT) | + (ADR_IMM_HIMASK << ADR_IMM_HISHIFT)); + break; + default: + if (aarch64_get_imm_shift_mask(type, &mask, &shift) < 0) { + pr_err("aarch64_insn_encode_immediate: unknown immediate encoding %d\n", + type); + return AARCH64_BREAK_FAULT; + } + } + + /* Update the immediate field. */ + insn &= ~(mask << shift); + insn |= (imm & mask) << shift; + + return insn; +} + +u32 aarch64_insn_decode_register(enum aarch64_insn_register_type type, + u32 insn) +{ + int shift; + + switch (type) { + case AARCH64_INSN_REGTYPE_RT: + case AARCH64_INSN_REGTYPE_RD: + shift = 0; + break; + case AARCH64_INSN_REGTYPE_RN: + shift = 5; + break; + case AARCH64_INSN_REGTYPE_RT2: + case AARCH64_INSN_REGTYPE_RA: + shift = 10; + break; + case AARCH64_INSN_REGTYPE_RM: + shift = 16; + break; + default: + pr_err("%s: unknown register type encoding %d\n", __func__, + type); + return 0; + } + + return (insn >> shift) & GENMASK(4, 0); +} + +static u32 aarch64_insn_encode_register(enum aarch64_insn_register_type type, + u32 insn, + enum aarch64_insn_register reg) +{ + int shift; + + if (insn == AARCH64_BREAK_FAULT) + return AARCH64_BREAK_FAULT; + + if (reg < AARCH64_INSN_REG_0 || reg > AARCH64_INSN_REG_SP) { + pr_err("%s: unknown register encoding %d\n", __func__, reg); + return AARCH64_BREAK_FAULT; + } + + switch (type) { + case AARCH64_INSN_REGTYPE_RT: + case AARCH64_INSN_REGTYPE_RD: + shift = 0; + break; + case AARCH64_INSN_REGTYPE_RN: + shift = 5; + break; + case AARCH64_INSN_REGTYPE_RT2: + case AARCH64_INSN_REGTYPE_RA: + shift = 10; + break; + case AARCH64_INSN_REGTYPE_RM: + case AARCH64_INSN_REGTYPE_RS: + shift = 16; + break; + default: + pr_err("%s: unknown register type encoding %d\n", __func__, + type); + return AARCH64_BREAK_FAULT; + } + + insn &= ~(GENMASK(4, 0) << shift); + insn |= reg << shift; + + return insn; +} + +static u32 aarch64_insn_encode_ldst_size(enum aarch64_insn_size_type type, + u32 insn) +{ + u32 size; + + switch (type) { + case AARCH64_INSN_SIZE_8: + size = 0; + break; + case AARCH64_INSN_SIZE_16: + size = 1; + break; + case AARCH64_INSN_SIZE_32: + size = 2; + break; + case AARCH64_INSN_SIZE_64: + size = 3; + break; + default: + pr_err("%s: unknown size encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + insn &= ~GENMASK(31, 30); + insn 
|= size << 30; + + return insn; +} + +static inline long branch_imm_common(unsigned long pc, unsigned long addr, + long range) +{ + long offset; + + if ((pc & 0x3) || (addr & 0x3)) { + pr_err("%s: A64 instructions must be word aligned\n", __func__); + return range; + } + + offset = ((long)addr - (long)pc); + + if (offset < -range || offset >= range) { + pr_err("%s: offset out of range\n", __func__); + return range; + } + + return offset; +} + +u32 __kprobes aarch64_insn_gen_branch_imm(unsigned long pc, unsigned long addr, + enum aarch64_insn_branch_type type) +{ + u32 insn; + long offset; + + /* + * B/BL support [-128M, 128M) offset + * ARM64 virtual address arrangement guarantees all kernel and module + * texts are within +/-128M. + */ + offset = branch_imm_common(pc, addr, SZ_128M); + if (offset >= SZ_128M) + return AARCH64_BREAK_FAULT; + + switch (type) { + case AARCH64_INSN_BRANCH_LINK: + insn = aarch64_insn_get_bl_value(); + break; + case AARCH64_INSN_BRANCH_NOLINK: + insn = aarch64_insn_get_b_value(); + break; + default: + pr_err("%s: unknown branch encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_26, insn, + offset >> 2); +} + +u32 aarch64_insn_gen_comp_branch_imm(unsigned long pc, unsigned long addr, + enum aarch64_insn_register reg, + enum aarch64_insn_variant variant, + enum aarch64_insn_branch_type type) +{ + u32 insn; + long offset; + + offset = branch_imm_common(pc, addr, SZ_1M); + if (offset >= SZ_1M) + return AARCH64_BREAK_FAULT; + + switch (type) { + case AARCH64_INSN_BRANCH_COMP_ZERO: + insn = aarch64_insn_get_cbz_value(); + break; + case AARCH64_INSN_BRANCH_COMP_NONZERO: + insn = aarch64_insn_get_cbnz_value(); + break; + default: + pr_err("%s: unknown branch encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT; + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, reg); + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_19, insn, + offset >> 2); +} + +u32 aarch64_insn_gen_cond_branch_imm(unsigned long pc, unsigned long addr, + enum aarch64_insn_condition cond) +{ + u32 insn; + long offset; + + offset = branch_imm_common(pc, addr, SZ_1M); + + insn = aarch64_insn_get_bcond_value(); + + if (cond < AARCH64_INSN_COND_EQ || cond > AARCH64_INSN_COND_AL) { + pr_err("%s: unknown condition encoding %d\n", __func__, cond); + return AARCH64_BREAK_FAULT; + } + insn |= cond; + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_19, insn, + offset >> 2); +} + +u32 __kprobes aarch64_insn_gen_hint(enum aarch64_insn_hint_cr_op op) +{ + return aarch64_insn_get_hint_value() | op; +} + +u32 __kprobes aarch64_insn_gen_nop(void) +{ + return aarch64_insn_gen_hint(AARCH64_INSN_HINT_NOP); +} + +u32 aarch64_insn_gen_branch_reg(enum aarch64_insn_register reg, + enum aarch64_insn_branch_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_BRANCH_NOLINK: + insn = aarch64_insn_get_br_value(); + break; + case AARCH64_INSN_BRANCH_LINK: + insn = aarch64_insn_get_blr_value(); + break; + case AARCH64_INSN_BRANCH_RETURN: + insn = aarch64_insn_get_ret_value(); + break; + default: + pr_err("%s: unknown branch encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + return 
aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, reg); +} + +u32 aarch64_insn_gen_load_store_reg(enum aarch64_insn_register reg, + enum aarch64_insn_register base, + enum aarch64_insn_register offset, + enum aarch64_insn_size_type size, + enum aarch64_insn_ldst_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_LDST_LOAD_REG_OFFSET: + insn = aarch64_insn_get_ldr_reg_value(); + break; + case AARCH64_INSN_LDST_STORE_REG_OFFSET: + insn = aarch64_insn_get_str_reg_value(); + break; + default: + pr_err("%s: unknown load/store encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_ldst_size(size, insn); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, reg); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, + base); + + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, + offset); +} + +u32 aarch64_insn_gen_load_store_pair(enum aarch64_insn_register reg1, + enum aarch64_insn_register reg2, + enum aarch64_insn_register base, + int offset, + enum aarch64_insn_variant variant, + enum aarch64_insn_ldst_type type) +{ + u32 insn; + int shift; + + switch (type) { + case AARCH64_INSN_LDST_LOAD_PAIR_PRE_INDEX: + insn = aarch64_insn_get_ldp_pre_value(); + break; + case AARCH64_INSN_LDST_STORE_PAIR_PRE_INDEX: + insn = aarch64_insn_get_stp_pre_value(); + break; + case AARCH64_INSN_LDST_LOAD_PAIR_POST_INDEX: + insn = aarch64_insn_get_ldp_post_value(); + break; + case AARCH64_INSN_LDST_STORE_PAIR_POST_INDEX: + insn = aarch64_insn_get_stp_post_value(); + break; + default: + pr_err("%s: unknown load/store encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + if ((offset & 0x3) || (offset < -256) || (offset > 252)) { + pr_err("%s: offset must be multiples of 4 in the range of [-256, 252] %d\n", + __func__, offset); + return AARCH64_BREAK_FAULT; + } + shift = 2; + break; + case AARCH64_INSN_VARIANT_64BIT: + if ((offset & 0x7) || (offset < -512) || (offset > 504)) { + pr_err("%s: offset must be multiples of 8 in the range of [-512, 504] %d\n", + __func__, offset); + return AARCH64_BREAK_FAULT; + } + shift = 3; + insn |= AARCH64_INSN_SF_BIT; + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, + reg1); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT2, insn, + reg2); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, + base); + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_7, insn, + offset >> shift); +} + +u32 aarch64_insn_gen_load_store_ex(enum aarch64_insn_register reg, + enum aarch64_insn_register base, + enum aarch64_insn_register state, + enum aarch64_insn_size_type size, + enum aarch64_insn_ldst_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_LDST_LOAD_EX: + insn = aarch64_insn_get_load_ex_value(); + break; + case AARCH64_INSN_LDST_STORE_EX: + insn = aarch64_insn_get_store_ex_value(); + break; + default: + pr_err("%s: unknown load/store exclusive encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_ldst_size(size, insn); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, + reg); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, + base); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT2, insn, + AARCH64_INSN_REG_ZR); 
+ + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RS, insn, + state); +} + +u32 aarch64_insn_gen_ldadd(enum aarch64_insn_register result, + enum aarch64_insn_register address, + enum aarch64_insn_register value, + enum aarch64_insn_size_type size) +{ + u32 insn = aarch64_insn_get_ldadd_value(); + + switch (size) { + case AARCH64_INSN_SIZE_32: + case AARCH64_INSN_SIZE_64: + break; + default: + pr_err("%s: unimplemented size encoding %d\n", __func__, size); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_ldst_size(size, insn); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, + result); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, + address); + + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RS, insn, + value); +} + +u32 aarch64_insn_gen_stadd(enum aarch64_insn_register address, + enum aarch64_insn_register value, + enum aarch64_insn_size_type size) +{ + /* + * STADD is simply encoded as an alias for LDADD with XZR as + * the destination register. + */ + return aarch64_insn_gen_ldadd(AARCH64_INSN_REG_ZR, address, + value, size); +} + +static u32 aarch64_insn_encode_prfm_imm(enum aarch64_insn_prfm_type type, + enum aarch64_insn_prfm_target target, + enum aarch64_insn_prfm_policy policy, + u32 insn) +{ + u32 imm_type = 0, imm_target = 0, imm_policy = 0; + + switch (type) { + case AARCH64_INSN_PRFM_TYPE_PLD: + break; + case AARCH64_INSN_PRFM_TYPE_PLI: + imm_type = BIT(0); + break; + case AARCH64_INSN_PRFM_TYPE_PST: + imm_type = BIT(1); + break; + default: + pr_err("%s: unknown prfm type encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + switch (target) { + case AARCH64_INSN_PRFM_TARGET_L1: + break; + case AARCH64_INSN_PRFM_TARGET_L2: + imm_target = BIT(0); + break; + case AARCH64_INSN_PRFM_TARGET_L3: + imm_target = BIT(1); + break; + default: + pr_err("%s: unknown prfm target encoding %d\n", __func__, target); + return AARCH64_BREAK_FAULT; + } + + switch (policy) { + case AARCH64_INSN_PRFM_POLICY_KEEP: + break; + case AARCH64_INSN_PRFM_POLICY_STRM: + imm_policy = BIT(0); + break; + default: + pr_err("%s: unknown prfm policy encoding %d\n", __func__, policy); + return AARCH64_BREAK_FAULT; + } + + /* In this case, imm5 is encoded into Rt field. 
*/ + insn &= ~GENMASK(4, 0); + insn |= imm_policy | (imm_target << 1) | (imm_type << 3); + + return insn; +} + +u32 aarch64_insn_gen_prefetch(enum aarch64_insn_register base, + enum aarch64_insn_prfm_type type, + enum aarch64_insn_prfm_target target, + enum aarch64_insn_prfm_policy policy) +{ + u32 insn = aarch64_insn_get_prfm_value(); + + insn = aarch64_insn_encode_ldst_size(AARCH64_INSN_SIZE_64, insn); + + insn = aarch64_insn_encode_prfm_imm(type, target, policy, insn); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, + base); + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_12, insn, 0); +} + +u32 aarch64_insn_gen_add_sub_imm(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + int imm, enum aarch64_insn_variant variant, + enum aarch64_insn_adsb_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_ADSB_ADD: + insn = aarch64_insn_get_add_imm_value(); + break; + case AARCH64_INSN_ADSB_SUB: + insn = aarch64_insn_get_sub_imm_value(); + break; + case AARCH64_INSN_ADSB_ADD_SETFLAGS: + insn = aarch64_insn_get_adds_imm_value(); + break; + case AARCH64_INSN_ADSB_SUB_SETFLAGS: + insn = aarch64_insn_get_subs_imm_value(); + break; + default: + pr_err("%s: unknown add/sub encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT; + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + /* We can't encode more than a 24bit value (12bit + 12bit shift) */ + if (imm & ~(BIT(24) - 1)) + goto out; + + /* If we have something in the top 12 bits... */ + if (imm & ~(SZ_4K - 1)) { + /* ... and in the low 12 bits -> error */ + if (imm & (SZ_4K - 1)) + goto out; + + imm >>= 12; + insn |= AARCH64_INSN_LSL_12; + } + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_12, insn, imm); + +out: + pr_err("%s: invalid immediate encoding %d\n", __func__, imm); + return AARCH64_BREAK_FAULT; +} + +u32 aarch64_insn_gen_bitfield(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + int immr, int imms, + enum aarch64_insn_variant variant, + enum aarch64_insn_bitfield_type type) +{ + u32 insn; + u32 mask; + + switch (type) { + case AARCH64_INSN_BITFIELD_MOVE: + insn = aarch64_insn_get_bfm_value(); + break; + case AARCH64_INSN_BITFIELD_MOVE_UNSIGNED: + insn = aarch64_insn_get_ubfm_value(); + break; + case AARCH64_INSN_BITFIELD_MOVE_SIGNED: + insn = aarch64_insn_get_sbfm_value(); + break; + default: + pr_err("%s: unknown bitfield encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + mask = GENMASK(4, 0); + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT | AARCH64_INSN_N_BIT; + mask = GENMASK(5, 0); + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + if (immr & ~mask) { + pr_err("%s: invalid immr encoding %d\n", __func__, immr); + return AARCH64_BREAK_FAULT; + } + if (imms & ~mask) { + pr_err("%s: invalid imms encoding %d\n", __func__, imms); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); + + 
insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_R, insn, immr); + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_S, insn, imms); +} + +u32 aarch64_insn_gen_movewide(enum aarch64_insn_register dst, + int imm, int shift, + enum aarch64_insn_variant variant, + enum aarch64_insn_movewide_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_MOVEWIDE_ZERO: + insn = aarch64_insn_get_movz_value(); + break; + case AARCH64_INSN_MOVEWIDE_KEEP: + insn = aarch64_insn_get_movk_value(); + break; + case AARCH64_INSN_MOVEWIDE_INVERSE: + insn = aarch64_insn_get_movn_value(); + break; + default: + pr_err("%s: unknown movewide encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + if (imm & ~(SZ_64K - 1)) { + pr_err("%s: invalid immediate encoding %d\n", __func__, imm); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + if (shift != 0 && shift != 16) { + pr_err("%s: invalid shift encoding %d\n", __func__, + shift); + return AARCH64_BREAK_FAULT; + } + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT; + if (shift != 0 && shift != 16 && shift != 32 && shift != 48) { + pr_err("%s: invalid shift encoding %d\n", __func__, + shift); + return AARCH64_BREAK_FAULT; + } + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + insn |= (shift >> 4) << 21; + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_16, insn, imm); +} + +u32 aarch64_insn_gen_add_sub_shifted_reg(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + enum aarch64_insn_register reg, + int shift, + enum aarch64_insn_variant variant, + enum aarch64_insn_adsb_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_ADSB_ADD: + insn = aarch64_insn_get_add_value(); + break; + case AARCH64_INSN_ADSB_SUB: + insn = aarch64_insn_get_sub_value(); + break; + case AARCH64_INSN_ADSB_ADD_SETFLAGS: + insn = aarch64_insn_get_adds_value(); + break; + case AARCH64_INSN_ADSB_SUB_SETFLAGS: + insn = aarch64_insn_get_subs_value(); + break; + default: + pr_err("%s: unknown add/sub encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + if (shift & ~(SZ_32 - 1)) { + pr_err("%s: invalid shift encoding %d\n", __func__, + shift); + return AARCH64_BREAK_FAULT; + } + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT; + if (shift & ~(SZ_64 - 1)) { + pr_err("%s: invalid shift encoding %d\n", __func__, + shift); + return AARCH64_BREAK_FAULT; + } + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, reg); + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_6, insn, shift); +} + +u32 aarch64_insn_gen_data1(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + enum aarch64_insn_variant variant, + enum aarch64_insn_data1_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_DATA1_REVERSE_16: + insn = aarch64_insn_get_rev16_value(); + break; + case AARCH64_INSN_DATA1_REVERSE_32: + insn = aarch64_insn_get_rev32_value(); + break; + case AARCH64_INSN_DATA1_REVERSE_64: + if (variant != 
AARCH64_INSN_VARIANT_64BIT) { + pr_err("%s: invalid variant for reverse64 %d\n", + __func__, variant); + return AARCH64_BREAK_FAULT; + } + insn = aarch64_insn_get_rev64_value(); + break; + default: + pr_err("%s: unknown data1 encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT; + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); + + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); +} + +u32 aarch64_insn_gen_data2(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + enum aarch64_insn_register reg, + enum aarch64_insn_variant variant, + enum aarch64_insn_data2_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_DATA2_UDIV: + insn = aarch64_insn_get_udiv_value(); + break; + case AARCH64_INSN_DATA2_SDIV: + insn = aarch64_insn_get_sdiv_value(); + break; + case AARCH64_INSN_DATA2_LSLV: + insn = aarch64_insn_get_lslv_value(); + break; + case AARCH64_INSN_DATA2_LSRV: + insn = aarch64_insn_get_lsrv_value(); + break; + case AARCH64_INSN_DATA2_ASRV: + insn = aarch64_insn_get_asrv_value(); + break; + case AARCH64_INSN_DATA2_RORV: + insn = aarch64_insn_get_rorv_value(); + break; + default: + pr_err("%s: unknown data2 encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT; + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); + + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, reg); +} + +u32 aarch64_insn_gen_data3(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + enum aarch64_insn_register reg1, + enum aarch64_insn_register reg2, + enum aarch64_insn_variant variant, + enum aarch64_insn_data3_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_DATA3_MADD: + insn = aarch64_insn_get_madd_value(); + break; + case AARCH64_INSN_DATA3_MSUB: + insn = aarch64_insn_get_msub_value(); + break; + default: + pr_err("%s: unknown data3 encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT; + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RA, insn, src); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, + reg1); + + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, + reg2); +} + +u32 aarch64_insn_gen_logical_shifted_reg(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + enum aarch64_insn_register reg, + int shift, + enum aarch64_insn_variant variant, + enum aarch64_insn_logic_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_LOGIC_AND: + insn = aarch64_insn_get_and_value(); + break; + case AARCH64_INSN_LOGIC_BIC: + insn = aarch64_insn_get_bic_value(); + 
break; + case AARCH64_INSN_LOGIC_ORR: + insn = aarch64_insn_get_orr_value(); + break; + case AARCH64_INSN_LOGIC_ORN: + insn = aarch64_insn_get_orn_value(); + break; + case AARCH64_INSN_LOGIC_EOR: + insn = aarch64_insn_get_eor_value(); + break; + case AARCH64_INSN_LOGIC_EON: + insn = aarch64_insn_get_eon_value(); + break; + case AARCH64_INSN_LOGIC_AND_SETFLAGS: + insn = aarch64_insn_get_ands_value(); + break; + case AARCH64_INSN_LOGIC_BIC_SETFLAGS: + insn = aarch64_insn_get_bics_value(); + break; + default: + pr_err("%s: unknown logical encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + if (shift & ~(SZ_32 - 1)) { + pr_err("%s: invalid shift encoding %d\n", __func__, + shift); + return AARCH64_BREAK_FAULT; + } + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT; + if (shift & ~(SZ_64 - 1)) { + pr_err("%s: invalid shift encoding %d\n", __func__, + shift); + return AARCH64_BREAK_FAULT; + } + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, reg); + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_6, insn, shift); +} + +/* + * MOV (register) is architecturally an alias of ORR (shifted register) where + * MOV <*d>, <*m> is equivalent to ORR <*d>, <*ZR>, <*m> + */ +u32 aarch64_insn_gen_move_reg(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + enum aarch64_insn_variant variant) +{ + return aarch64_insn_gen_logical_shifted_reg(dst, AARCH64_INSN_REG_ZR, + src, 0, variant, + AARCH64_INSN_LOGIC_ORR); +} + +u32 aarch64_insn_gen_adr(unsigned long pc, unsigned long addr, + enum aarch64_insn_register reg, + enum aarch64_insn_adr_type type) +{ + u32 insn; + s32 offset; + + switch (type) { + case AARCH64_INSN_ADR_TYPE_ADR: + insn = aarch64_insn_get_adr_value(); + offset = addr - pc; + break; + case AARCH64_INSN_ADR_TYPE_ADRP: + insn = aarch64_insn_get_adrp_value(); + offset = (addr - ALIGN_DOWN(pc, SZ_4K)) >> 12; + break; + default: + pr_err("%s: unknown adr encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + if (offset < -SZ_1M || offset >= SZ_1M) + return AARCH64_BREAK_FAULT; + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, reg); + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_ADR, insn, offset); +} + +/* + * Decode the imm field of a branch, and return the byte offset as a + * signed value (so it can be used when computing a new branch + * target). + */ +s32 aarch64_get_branch_offset(u32 insn) +{ + s32 imm; + + if (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn)) { + imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_26, insn); + return (imm << 6) >> 4; + } + + if (aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) || + aarch64_insn_is_bcond(insn)) { + imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_19, insn); + return (imm << 13) >> 11; + } + + if (aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn)) { + imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_14, insn); + return (imm << 18) >> 16; + } + + /* Unhandled instruction */ + BUG(); +} + +/* + * Encode the displacement of a branch in the imm field and return the + * updated instruction. 
+ */ +u32 aarch64_set_branch_offset(u32 insn, s32 offset) +{ + if (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn)) + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_26, insn, + offset >> 2); + + if (aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) || + aarch64_insn_is_bcond(insn)) + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_19, insn, + offset >> 2); + + if (aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn)) + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_14, insn, + offset >> 2); + + /* Unhandled instruction */ + BUG(); +} + +s32 aarch64_insn_adrp_get_offset(u32 insn) +{ + BUG_ON(!aarch64_insn_is_adrp(insn)); + return aarch64_insn_decode_immediate(AARCH64_INSN_IMM_ADR, insn) << 12; +} + +u32 aarch64_insn_adrp_set_offset(u32 insn, s32 offset) +{ + BUG_ON(!aarch64_insn_is_adrp(insn)); + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_ADR, insn, + offset >> 12); +} + +/* + * Extract the Op/CR data from a msr/mrs instruction. + */ +u32 aarch64_insn_extract_system_reg(u32 insn) +{ + return (insn & 0x1FFFE0) >> 5; +} + +bool aarch32_insn_is_wide(u32 insn) +{ + return insn >= 0xe800; +} + +/* + * Macros/defines for extracting register numbers from instruction. + */ +u32 aarch32_insn_extract_reg_num(u32 insn, int offset) +{ + return (insn & (0xf << offset)) >> offset; +} + +#define OPC2_MASK 0x7 +#define OPC2_OFFSET 5 +u32 aarch32_insn_mcr_extract_opc2(u32 insn) +{ + return (insn & (OPC2_MASK << OPC2_OFFSET)) >> OPC2_OFFSET; +} + +#define CRM_MASK 0xf +u32 aarch32_insn_mcr_extract_crm(u32 insn) +{ + return insn & CRM_MASK; +} + +static bool range_of_ones(u64 val) +{ + /* Doesn't handle full ones or full zeroes */ + u64 sval = val >> __ffs64(val); + + /* One of Sean Eron Anderson's bithack tricks */ + return ((sval + 1) & (sval)) == 0; +} + +static u32 aarch64_encode_immediate(u64 imm, + enum aarch64_insn_variant variant, + u32 insn) +{ + unsigned int immr, imms, n, ones, ror, esz, tmp; + u64 mask; + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + esz = 32; + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT; + esz = 64; + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + mask = GENMASK(esz - 1, 0); + + /* Can't encode full zeroes, full ones, or value wider than the mask */ + if (!imm || imm == mask || imm & ~mask) + return AARCH64_BREAK_FAULT; + + /* + * Inverse of Replicate(). Try to spot a repeating pattern + * with a pow2 stride. + */ + for (tmp = esz / 2; tmp >= 2; tmp /= 2) { + u64 emask = BIT(tmp) - 1; + + if ((imm & emask) != ((imm >> tmp) & emask)) + break; + + esz = tmp; + mask = emask; + } + + /* N is only set if we're encoding a 64bit value */ + n = esz == 64; + + /* Trim imm to the element size */ + imm &= mask; + + /* That's how many ones we need to encode */ + ones = hweight64(imm); + + /* + * imms is set to (ones - 1), prefixed with a string of ones + * and a zero if they fit. Cap it to 6 bits. + */ + imms = ones - 1; + imms |= 0xf << ffs(esz); + imms &= BIT(6) - 1; + + /* Compute the rotation */ + if (range_of_ones(imm)) { + /* + * Pattern: 0..01..10..0 + * + * Compute how many rotate we need to align it right + */ + ror = __ffs64(imm); + } else { + /* + * Pattern: 0..01..10..01..1 + * + * Fill the unused top bits with ones, and check if + * the result is a valid immediate (all ones with a + * contiguous ranges of zeroes). 
+ */ + imm |= ~mask; + if (!range_of_ones(~imm)) + return AARCH64_BREAK_FAULT; + + /* + * Compute the rotation to get a continuous set of + * ones, with the first bit set at position 0 + */ + ror = fls(~imm); + } + + /* + * immr is the number of bits we need to rotate back to the + * original set of ones. Note that this is relative to the + * element size... + */ + immr = (esz - ror) % esz; + + insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_N, insn, n); + insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_R, insn, immr); + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_S, insn, imms); +} + +u32 aarch64_insn_gen_logical_immediate(enum aarch64_insn_logic_type type, + enum aarch64_insn_variant variant, + enum aarch64_insn_register Rn, + enum aarch64_insn_register Rd, + u64 imm) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_LOGIC_AND: + insn = aarch64_insn_get_and_imm_value(); + break; + case AARCH64_INSN_LOGIC_ORR: + insn = aarch64_insn_get_orr_imm_value(); + break; + case AARCH64_INSN_LOGIC_EOR: + insn = aarch64_insn_get_eor_imm_value(); + break; + case AARCH64_INSN_LOGIC_AND_SETFLAGS: + insn = aarch64_insn_get_ands_imm_value(); + break; + default: + pr_err("%s: unknown logical encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, Rd); + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, Rn); + return aarch64_encode_immediate(imm, variant, insn); +} + +u32 aarch64_insn_gen_extr(enum aarch64_insn_variant variant, + enum aarch64_insn_register Rm, + enum aarch64_insn_register Rn, + enum aarch64_insn_register Rd, + u8 lsb) +{ + u32 insn; + + insn = aarch64_insn_get_extr_value(); + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + if (lsb > 31) + return AARCH64_BREAK_FAULT; + break; + case AARCH64_INSN_VARIANT_64BIT: + if (lsb > 63) + return AARCH64_BREAK_FAULT; + insn |= AARCH64_INSN_SF_BIT; + insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_N, insn, 1); + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_S, insn, lsb); + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, Rd); + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, Rn); + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, Rm); +} -- cgit v1.2.3 From 427bfc59e2281eaede70f050062dc31257c46652 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Wed, 3 Mar 2021 18:05:33 +0100 Subject: arm64: insn: Add SVE instruction class SVE has been public for some time now. Let the decoder acknowledge its existence. 
Signed-off-by: Julien Thierry Link: https://lore.kernel.org/r/20210303170536.1838032-6-jthierry@redhat.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/insn.h | 1 + arch/arm64/lib/insn.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index 7adc4398fadb..93f7b0c86dfd 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -31,6 +31,7 @@ */ enum aarch64_insn_encoding_class { AARCH64_INSN_CLS_UNKNOWN, /* UNALLOCATED */ + AARCH64_INSN_CLS_SVE, /* SVE instructions */ AARCH64_INSN_CLS_DP_IMM, /* Data processing - immediate */ AARCH64_INSN_CLS_DP_REG, /* Data processing - register */ AARCH64_INSN_CLS_DP_FPSIMD, /* Data processing - SIMD and FP */ diff --git a/arch/arm64/lib/insn.c b/arch/arm64/lib/insn.c index 6ff8826ae7ea..b506a4b1e38c 100644 --- a/arch/arm64/lib/insn.c +++ b/arch/arm64/lib/insn.c @@ -23,7 +23,7 @@ static const int aarch64_insn_encoding_class[] = { AARCH64_INSN_CLS_UNKNOWN, AARCH64_INSN_CLS_UNKNOWN, - AARCH64_INSN_CLS_UNKNOWN, + AARCH64_INSN_CLS_SVE, AARCH64_INSN_CLS_UNKNOWN, AARCH64_INSN_CLS_LDST, AARCH64_INSN_CLS_DP_REG, -- cgit v1.2.3 From d4b217330d7e0320084ff04c8491964f1f68980a Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Wed, 3 Mar 2021 18:05:34 +0100 Subject: arm64: insn: Add barrier encodings Create necessary functions to encode/decode aarch64 barrier instructions. DSB needs special case handling as it has multiple encodings. Signed-off-by: Julien Thierry Link: https://lore.kernel.org/r/20210303170536.1838032-7-jthierry@redhat.com [will: Don't reject DSB #4] Signed-off-by: Will Deacon --- arch/arm64/include/asm/insn.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index 93f7b0c86dfd..b8e2c6c46547 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -370,6 +370,14 @@ __AARCH64_INSN_FUNCS(eret_auth, 0xFFFFFBFF, 0xD69F0BFF) __AARCH64_INSN_FUNCS(mrs, 0xFFF00000, 0xD5300000) __AARCH64_INSN_FUNCS(msr_imm, 0xFFF8F01F, 0xD500401F) __AARCH64_INSN_FUNCS(msr_reg, 0xFFF00000, 0xD5100000) +__AARCH64_INSN_FUNCS(dmb, 0xFFFFF0FF, 0xD50330BF) +__AARCH64_INSN_FUNCS(dsb_base, 0xFFFFF0FF, 0xD503309F) +__AARCH64_INSN_FUNCS(dsb_nxs, 0xFFFFF3FF, 0xD503323F) +__AARCH64_INSN_FUNCS(isb, 0xFFFFF0FF, 0xD50330DF) +__AARCH64_INSN_FUNCS(sb, 0xFFFFFFFF, 0xD50330FF) +__AARCH64_INSN_FUNCS(clrex, 0xFFFFF0FF, 0xD503305F) +__AARCH64_INSN_FUNCS(ssbb, 0xFFFFFFFF, 0xD503309F) +__AARCH64_INSN_FUNCS(pssbb, 0xFFFFFFFF, 0xD503349F) #undef __AARCH64_INSN_FUNCS @@ -381,6 +389,19 @@ static inline bool aarch64_insn_is_adr_adrp(u32 insn) return aarch64_insn_is_adr(insn) || aarch64_insn_is_adrp(insn); } +static inline bool aarch64_insn_is_dsb(u32 insn) +{ + return aarch64_insn_is_dsb_base(insn) || aarch64_insn_is_dsb_nxs(insn); +} + +static inline bool aarch64_insn_is_barrier(u32 insn) +{ + return aarch64_insn_is_dmb(insn) || aarch64_insn_is_dsb(insn) || + aarch64_insn_is_isb(insn) || aarch64_insn_is_sb(insn) || + aarch64_insn_is_clrex(insn) || aarch64_insn_is_ssbb(insn) || + aarch64_insn_is_pssbb(insn); +} + enum aarch64_insn_encoding_class aarch64_get_insn_class(u32 insn); bool aarch64_insn_uses_literal(u32 insn); bool aarch64_insn_is_branch(u32 insn); -- cgit v1.2.3 From 54880044c639f9c59346eabe637f9f8f39a112b8 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Wed, 3 Mar 2021 18:05:35 +0100 Subject: arm64: insn: Add some opcodes to instruction decoder Add decoding capability 
for some instructions that objtool will need to decode. Signed-off-by: Julien Thierry Link: https://lore.kernel.org/r/20210303170536.1838032-8-jthierry@redhat.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/insn.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index b8e2c6c46547..ac8f47ff7b18 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -296,6 +296,12 @@ __AARCH64_INSN_FUNCS(adr, 0x9F000000, 0x10000000) __AARCH64_INSN_FUNCS(adrp, 0x9F000000, 0x90000000) __AARCH64_INSN_FUNCS(prfm, 0x3FC00000, 0x39800000) __AARCH64_INSN_FUNCS(prfm_lit, 0xFF000000, 0xD8000000) +__AARCH64_INSN_FUNCS(store_imm, 0x3FC00000, 0x39000000) +__AARCH64_INSN_FUNCS(load_imm, 0x3FC00000, 0x39400000) +__AARCH64_INSN_FUNCS(store_pre, 0x3FE00C00, 0x38000C00) +__AARCH64_INSN_FUNCS(load_pre, 0x3FE00C00, 0x38400C00) +__AARCH64_INSN_FUNCS(store_post, 0x3FE00C00, 0x38000400) +__AARCH64_INSN_FUNCS(load_post, 0x3FE00C00, 0x38400400) __AARCH64_INSN_FUNCS(str_reg, 0x3FE0EC00, 0x38206800) __AARCH64_INSN_FUNCS(ldadd, 0x3F20FC00, 0x38200000) __AARCH64_INSN_FUNCS(ldr_reg, 0x3FE0EC00, 0x38606800) @@ -304,6 +310,8 @@ __AARCH64_INSN_FUNCS(ldrsw_lit, 0xFF000000, 0x98000000) __AARCH64_INSN_FUNCS(exclusive, 0x3F800000, 0x08000000) __AARCH64_INSN_FUNCS(load_ex, 0x3F400000, 0x08400000) __AARCH64_INSN_FUNCS(store_ex, 0x3F400000, 0x08000000) +__AARCH64_INSN_FUNCS(stp, 0x7FC00000, 0x29000000) +__AARCH64_INSN_FUNCS(ldp, 0x7FC00000, 0x29400000) __AARCH64_INSN_FUNCS(stp_post, 0x7FC00000, 0x28800000) __AARCH64_INSN_FUNCS(ldp_post, 0x7FC00000, 0x28C00000) __AARCH64_INSN_FUNCS(stp_pre, 0x7FC00000, 0x29800000) @@ -336,6 +344,7 @@ __AARCH64_INSN_FUNCS(rev64, 0x7FFFFC00, 0x5AC00C00) __AARCH64_INSN_FUNCS(and, 0x7F200000, 0x0A000000) __AARCH64_INSN_FUNCS(bic, 0x7F200000, 0x0A200000) __AARCH64_INSN_FUNCS(orr, 0x7F200000, 0x2A000000) +__AARCH64_INSN_FUNCS(mov_reg, 0x7FE0FFE0, 0x2A0003E0) __AARCH64_INSN_FUNCS(orn, 0x7F200000, 0x2A200000) __AARCH64_INSN_FUNCS(eor, 0x7F200000, 0x4A000000) __AARCH64_INSN_FUNCS(eon, 0x7F200000, 0x4A200000) -- cgit v1.2.3 From 71766b81de8204a0fb56de3ad1972516bac99f5b Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Wed, 3 Mar 2021 18:05:36 +0100 Subject: arm64: insn: Add load/store decoding helpers Provide some function to group different load/store instructions. 
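As the diff below shows, each group is simply a disjunction of the mask/value tests that __AARCH64_INSN_FUNCS() generates. A worked illustration using the STP encodings added in the previous patch; the function name and the opcode 0xA9007BFD ("stp x29, x30, [sp]") are invented for illustration, the masks and values come from the patches:

    /*
     * __AARCH64_INSN_FUNCS(stp, 0x7FC00000, 0x29000000) expands to a helper
     * equivalent to this check for the signed-offset form of STP.
     */
    static bool example_is_stp_signed_offset(u32 insn)
    {
            return (insn & 0x7FC00000) == 0x29000000;
    }

    /*
     * 0xA9007BFD ("stp x29, x30, [sp]") matches here; the writeback forms
     * match stp_pre (0x29800000) and stp_post (0x28800000) instead, and the
     * aarch64_insn_is_store_pair() helper added below accepts any of the
     * three.
     */
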
Signed-off-by: Julien Thierry Link: https://lore.kernel.org/r/20210303170536.1838032-9-jthierry@redhat.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/insn.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index ac8f47ff7b18..1ea9611545bb 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -411,6 +411,34 @@ static inline bool aarch64_insn_is_barrier(u32 insn) aarch64_insn_is_pssbb(insn); } +static inline bool aarch64_insn_is_store_single(u32 insn) +{ + return aarch64_insn_is_store_imm(insn) || + aarch64_insn_is_store_pre(insn) || + aarch64_insn_is_store_post(insn); +} + +static inline bool aarch64_insn_is_store_pair(u32 insn) +{ + return aarch64_insn_is_stp(insn) || + aarch64_insn_is_stp_pre(insn) || + aarch64_insn_is_stp_post(insn); +} + +static inline bool aarch64_insn_is_load_single(u32 insn) +{ + return aarch64_insn_is_load_imm(insn) || + aarch64_insn_is_load_pre(insn) || + aarch64_insn_is_load_post(insn); +} + +static inline bool aarch64_insn_is_load_pair(u32 insn) +{ + return aarch64_insn_is_ldp(insn) || + aarch64_insn_is_ldp_pre(insn) || + aarch64_insn_is_ldp_post(insn); +} + enum aarch64_insn_encoding_class aarch64_get_insn_class(u32 insn); bool aarch64_insn_uses_literal(u32 insn); bool aarch64_insn_is_branch(u32 insn); -- cgit v1.2.3 From 16c230b30de8b69ae75d2b98d04a77904da58d15 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 27 May 2021 11:55:29 +0100 Subject: arm64: scs: Drop unused 'tmp' argument to scs_{load, save} asm macros The scs_load and scs_save asm macros don't make use of the mandatory 'tmp' register argument, so drop it and fix up the callers. Cc: Sami Tolvanen Cc: Mark Rutland Acked-by: Mark Rutland Reviewed-by: Sami Tolvanen Link: https://lore.kernel.org/r/20210527105529.21967-1-will@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/scs.h | 8 ++++---- arch/arm64/kernel/entry.S | 8 ++++---- arch/arm64/kernel/head.S | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/scs.h b/arch/arm64/include/asm/scs.h index eaa2cd92e4c1..8297bccf0784 100644 --- a/arch/arm64/include/asm/scs.h +++ b/arch/arm64/include/asm/scs.h @@ -9,18 +9,18 @@ #ifdef CONFIG_SHADOW_CALL_STACK scs_sp .req x18 - .macro scs_load tsk, tmp + .macro scs_load tsk ldr scs_sp, [\tsk, #TSK_TI_SCS_SP] .endm - .macro scs_save tsk, tmp + .macro scs_save tsk str scs_sp, [\tsk, #TSK_TI_SCS_SP] .endm #else - .macro scs_load tsk, tmp + .macro scs_load tsk .endm - .macro scs_save tsk, tmp + .macro scs_save tsk .endm #endif /* CONFIG_SHADOW_CALL_STACK */ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 294f24e16fee..3153f1448cdb 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -275,7 +275,7 @@ alternative_else_nop_endif mte_set_kernel_gcr x22, x23 - scs_load tsk, x20 + scs_load tsk .else add x21, sp, #PT_REGS_SIZE get_current_task tsk @@ -375,7 +375,7 @@ alternative_if ARM64_WORKAROUND_845719 alternative_else_nop_endif #endif 3: - scs_save tsk, x0 + scs_save tsk #ifdef CONFIG_ARM64_PTR_AUTH alternative_if ARM64_HAS_ADDRESS_AUTH @@ -979,8 +979,8 @@ SYM_FUNC_START(cpu_switch_to) mov sp, x9 msr sp_el0, x1 ptrauth_keys_install_kernel x1, x8, x9, x10 - scs_save x0, x8 - scs_load x1, x8 + scs_save x0 + scs_load x1 ret SYM_FUNC_END(cpu_switch_to) NOKPROBE(cpu_switch_to) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 070ed53c049d..6a700526b117 100644 --- 
a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -412,7 +412,7 @@ SYM_FUNC_END(__create_page_tables) stp xzr, xzr, [sp, #S_STACKFRAME] add x29, sp, #S_STACKFRAME - scs_load \tsk, \tmp1 + scs_load \tsk adr_l \tmp1, __per_cpu_offset ldr w\tmp2, [\tsk, #TSK_CPU] -- cgit v1.2.3 From fdbef8c4e68ad423416aa6cc93d1616d6f8ac5b3 Mon Sep 17 00:00:00 2001 From: Yang Jihong Date: Fri, 30 Apr 2021 09:26:59 +0800 Subject: arm_pmu: Fix write counter incorrect in ARMv7 big-endian mode Commit 3a95200d3f89 ("arm_pmu: Change API to support 64bit counter values") changes the input "value" type from 32-bit to 64-bit, which introduces the following problem: ARMv7 PMU counters is 32-bit width, in big-endian mode, write counter uses high 32-bit, which writes an incorrect value. Before: Performance counter stats for 'ls': 2.22 msec task-clock # 0.675 CPUs utilized 0 context-switches # 0.000 K/sec 0 cpu-migrations # 0.000 K/sec 49 page-faults # 0.022 M/sec 2150476593 cycles # 966.663 GHz 2148588788 instructions # 1.00 insn per cycle 2147745484 branches # 965435.074 M/sec 2147508540 branch-misses # 99.99% of all branches None of the above hw event counters are correct. Solution: "value" forcibly converted to 32-bit type before being written to PMU register. After: Performance counter stats for 'ls': 2.09 msec task-clock # 0.681 CPUs utilized 0 context-switches # 0.000 K/sec 0 cpu-migrations # 0.000 K/sec 46 page-faults # 0.022 M/sec 2807301 cycles # 1.344 GHz 1060159 instructions # 0.38 insn per cycle 250496 branches # 119.914 M/sec 23192 branch-misses # 9.26% of all branches Fixes: 3a95200d3f89 ("arm_pmu: Change API to support 64bit counter values") Cc: Signed-off-by: Yang Jihong Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20210430012659.232110-1-yangjihong1@huawei.com Signed-off-by: Will Deacon --- arch/arm/kernel/perf_event_v7.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/kernel/perf_event_v7.c b/arch/arm/kernel/perf_event_v7.c index 2924d7910b10..eb2190477da1 100644 --- a/arch/arm/kernel/perf_event_v7.c +++ b/arch/arm/kernel/perf_event_v7.c @@ -773,10 +773,10 @@ static inline void armv7pmu_write_counter(struct perf_event *event, u64 value) pr_err("CPU%u writing wrong counter %d\n", smp_processor_id(), idx); } else if (idx == ARMV7_IDX_CYCLE_COUNTER) { - asm volatile("mcr p15, 0, %0, c9, c13, 0" : : "r" (value)); + asm volatile("mcr p15, 0, %0, c9, c13, 0" : : "r" ((u32)value)); } else { armv7_pmnc_select_counter(idx); - asm volatile("mcr p15, 0, %0, c9, c13, 2" : : "r" (value)); + asm volatile("mcr p15, 0, %0, c9, c13, 2" : : "r" ((u32)value)); } } -- cgit v1.2.3 From a5740e955540181f4ab8f076cc9795c6bbe4d730 Mon Sep 17 00:00:00 2001 From: Tian Tao Date: Thu, 20 May 2021 15:59:45 +0800 Subject: arm64: perf: Convert snprintf to sysfs_emit Use sysfs_emit instead of snprintf to avoid buf overrun,because in sysfs_emit it strictly checks whether buf is null or buf whether pagesize aligned, otherwise it returns an error. 
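A minimal sketch of the resulting pattern, for reference; sysfs_emit() is the real kernel API, while the callback name and the value printed here are invented for illustration:

    /*
     * sysfs_emit() checks that @buf is the page-aligned buffer sysfs hands
     * to show() (warning and returning 0 otherwise) and formats at most
     * PAGE_SIZE bytes into it.
     */
    static ssize_t example_show(struct device *dev,
                                struct device_attribute *attr, char *buf)
    {
            return sysfs_emit(buf, "0x%08x\n", 0x1234u);
    }
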
Signed-off-by: Tian Tao Link: https://lore.kernel.org/r/1621497585-30887-1-git-send-email-tiantao6@hisilicon.com Signed-off-by: Will Deacon --- arch/arm64/kernel/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index f594957e29bd..44b6eda69a81 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -312,7 +312,7 @@ static ssize_t slots_show(struct device *dev, struct device_attribute *attr, struct arm_pmu *cpu_pmu = container_of(pmu, struct arm_pmu, pmu); u32 slots = cpu_pmu->reg_pmmir & ARMV8_PMU_SLOTS_MASK; - return snprintf(page, PAGE_SIZE, "0x%08x\n", slots); + return sysfs_emit(page, "0x%08x\n", slots); } static DEVICE_ATTR_RO(slots); -- cgit v1.2.3 From 2db5223731b79cf5c617dc391ceb21dd5cb93237 Mon Sep 17 00:00:00 2001 From: Hao Fang Date: Sat, 22 May 2021 18:23:57 +0800 Subject: drivers/perf: hisi: use the correct HiSilicon copyright s/Hisilicon/HiSilicon/. It should use capital S, according to the official website https://www.hisilicon.com/en. Signed-off-by: Hao Fang Link: https://lore.kernel.org/r/1621679037-15323-1-git-send-email-fanghao11@huawei.com Signed-off-by: Will Deacon --- drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c | 2 +- drivers/perf/hisilicon/hisi_uncore_hha_pmu.c | 2 +- drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c | 2 +- drivers/perf/hisilicon/hisi_uncore_pmu.c | 2 +- drivers/perf/hisilicon/hisi_uncore_pmu.h | 2 +- drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c b/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c index 0c7777bf1542..62299ab5a9be 100644 --- a/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c @@ -2,7 +2,7 @@ /* * HiSilicon SoC DDRC uncore Hardware event counters support * - * Copyright (C) 2017 Hisilicon Limited + * Copyright (C) 2017 HiSilicon Limited * Author: Shaokun Zhang * Anurup M * diff --git a/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c b/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c index 12eb41ab1b8a..12b2c5e6d488 100644 --- a/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c @@ -2,7 +2,7 @@ /* * HiSilicon SoC HHA uncore Hardware event counters support * - * Copyright (C) 2017 Hisilicon Limited + * Copyright (C) 2017 HiSilicon Limited * Author: Shaokun Zhang * Anurup M * diff --git a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c index 773f69538090..560ab964c8b5 100644 --- a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c @@ -2,7 +2,7 @@ /* * HiSilicon SoC L3C uncore Hardware event counters support * - * Copyright (C) 2017 Hisilicon Limited + * Copyright (C) 2017 HiSilicon Limited * Author: Anurup M * Shaokun Zhang * diff --git a/drivers/perf/hisilicon/hisi_uncore_pmu.c b/drivers/perf/hisilicon/hisi_uncore_pmu.c index 5842593632e4..a738aeab5c04 100644 --- a/drivers/perf/hisilicon/hisi_uncore_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_pmu.c @@ -2,7 +2,7 @@ /* * HiSilicon SoC Hardware event counters support * - * Copyright (C) 2017 Hisilicon Limited + * Copyright (C) 2017 HiSilicon Limited * Author: Anurup M * Shaokun Zhang * diff --git a/drivers/perf/hisilicon/hisi_uncore_pmu.h b/drivers/perf/hisilicon/hisi_uncore_pmu.h index ea9d89bbc1ea..7f5841d6f592 100644 --- a/drivers/perf/hisilicon/hisi_uncore_pmu.h +++ 
b/drivers/perf/hisilicon/hisi_uncore_pmu.h @@ -2,7 +2,7 @@ /* * HiSilicon SoC Hardware event counters support * - * Copyright (C) 2017 Hisilicon Limited + * Copyright (C) 2017 HiSilicon Limited * Author: Anurup M * Shaokun Zhang * diff --git a/drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c b/drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c index 835ec3e2178f..08e028d9a406 100644 --- a/drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c @@ -2,7 +2,7 @@ /* * HiSilicon SLLC uncore Hardware event counters support * - * Copyright (C) 2020 Hisilicon Limited + * Copyright (C) 2020 HiSilicon Limited * Author: Shaokun Zhang * * This code is based on the uncore PMUs like arm-cci and arm-ccn. -- cgit v1.2.3 From 29c043760eea902f170b6485c6e88a5ef33a9908 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 28 May 2021 09:41:30 +0800 Subject: perf: arm_pmu: use DEVICE_ATTR_RO macro Use DEVICE_ATTR_RO helper instead of plain DEVICE_ATTR, which makes the code a bit shorter and easier to read. Signed-off-by: YueHaibing Link: https://lore.kernel.org/r/20210528014130.7708-1-yuehaibing@huawei.com Signed-off-by: Will Deacon --- drivers/perf/arm_pmu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index e57b348c1628..a64e254a731b 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -563,14 +563,14 @@ static int armpmu_filter_match(struct perf_event *event) return ret; } -static ssize_t armpmu_cpumask_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t cpus_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct arm_pmu *armpmu = to_arm_pmu(dev_get_drvdata(dev)); return cpumap_print_to_pagebuf(true, buf, &armpmu->supported_cpus); } -static DEVICE_ATTR(cpus, S_IRUGO, armpmu_cpumask_show, NULL); +static DEVICE_ATTR_RO(cpus); static struct attribute *armpmu_common_attrs[] = { &dev_attr_cpus.attr, -- cgit v1.2.3 From ccbe14ce88289ede522318ef3205e46f8455bbf2 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 28 May 2021 09:47:49 +0800 Subject: perf: qcom: use DEVICE_ATTR_RO macro Use DEVICE_ATTR_RO() helper instead of plain DEVICE_ATTR(), which makes the code a bit shorter and easier to read. 
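For reference, this is also why each callback in this and the following conversions is renamed to <attribute>_show: DEVICE_ATTR_RO() derives the callback name from the attribute name. Roughly, and simplified from the definition in include/linux/device.h:

    /* DEVICE_ATTR_RO(cpus) is approximately equivalent to: */
    static DEVICE_ATTR(cpus, 0444, cpus_show, NULL);
    /*
     * i.e. a read-only attribute whose show callback must be named
     * cpus_show(), exposed to the attribute lists as dev_attr_cpus.
     */
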
Signed-off-by: YueHaibing Link: https://lore.kernel.org/r/20210528014749.24068-1-yuehaibing@huawei.com Signed-off-by: Will Deacon --- drivers/perf/qcom_l3_pmu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/perf/qcom_l3_pmu.c b/drivers/perf/qcom_l3_pmu.c index bba078077c93..081273543c6b 100644 --- a/drivers/perf/qcom_l3_pmu.c +++ b/drivers/perf/qcom_l3_pmu.c @@ -670,15 +670,15 @@ static const struct attribute_group qcom_l3_cache_pmu_events_group = { /* cpumask */ -static ssize_t qcom_l3_cache_pmu_cpumask_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct l3cache_pmu *l3pmu = to_l3cache_pmu(dev_get_drvdata(dev)); return cpumap_print_to_pagebuf(true, buf, &l3pmu->cpumask); } -static DEVICE_ATTR(cpumask, 0444, qcom_l3_cache_pmu_cpumask_show, NULL); +static DEVICE_ATTR_RO(cpumask); static struct attribute *qcom_l3_cache_pmu_cpumask_attrs[] = { &dev_attr_cpumask.attr, -- cgit v1.2.3 From 21ad02e6b4c822d453faead4c96f0a86c4541b62 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 28 May 2021 09:49:40 +0800 Subject: perf: xgene_pmu: use DEVICE_ATTR_RO macro Use DEVICE_ATTR_RO() helper instead of plain DEVICE_ATTR(), which makes the code a bit shorter and easier to read. Signed-off-by: YueHaibing Link: https://lore.kernel.org/r/20210528014940.4184-1-yuehaibing@huawei.com Signed-off-by: Will Deacon --- drivers/perf/xgene_pmu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/perf/xgene_pmu.c b/drivers/perf/xgene_pmu.c index ffe3bdeec845..62d942534a6b 100644 --- a/drivers/perf/xgene_pmu.c +++ b/drivers/perf/xgene_pmu.c @@ -604,15 +604,15 @@ static const struct attribute_group mc_pmu_v3_events_attr_group = { /* * sysfs cpumask attributes */ -static ssize_t xgene_pmu_cpumask_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct xgene_pmu_dev *pmu_dev = to_pmu_dev(dev_get_drvdata(dev)); return cpumap_print_to_pagebuf(true, buf, &pmu_dev->parent->cpu); } -static DEVICE_ATTR(cpumask, S_IRUGO, xgene_pmu_cpumask_show, NULL); +static DEVICE_ATTR_RO(cpumask); static struct attribute *xgene_pmu_cpumask_attrs[] = { &dev_attr_cpumask.attr, -- cgit v1.2.3 From f9e36b388a325eee74fae3b545f64449c13f090a Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 28 May 2021 14:17:38 +0800 Subject: perf: arm_spe: use DEVICE_ATTR_RO macro Use DEVICE_ATTR_RO() helper instead of plain DEVICE_ATTR(), which makes the code a bit shorter and easier to read. 
Signed-off-by: YueHaibing Link: https://lore.kernel.org/r/20210528061738.23392-1-yuehaibing@huawei.com Signed-off-by: Will Deacon --- drivers/perf/arm_spe_pmu.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c index e3711cb4c1b5..d44bcc29d99c 100644 --- a/drivers/perf/arm_spe_pmu.c +++ b/drivers/perf/arm_spe_pmu.c @@ -231,15 +231,14 @@ static const struct attribute_group arm_spe_pmu_format_group = { .attrs = arm_spe_pmu_formats_attr, }; -static ssize_t arm_spe_pmu_get_attr_cpumask(struct device *dev, - struct device_attribute *attr, - char *buf) +static ssize_t cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct arm_spe_pmu *spe_pmu = dev_get_drvdata(dev); return cpumap_print_to_pagebuf(true, buf, &spe_pmu->supported_cpus); } -static DEVICE_ATTR(cpumask, S_IRUGO, arm_spe_pmu_get_attr_cpumask, NULL); +static DEVICE_ATTR_RO(cpumask); static struct attribute *arm_spe_pmu_attrs[] = { &dev_attr_cpumask.attr, -- cgit v1.2.3 From 43de30d36742dbbde22f2ad526c3e5a403c271e2 Mon Sep 17 00:00:00 2001 From: Sam Tebbs Date: Thu, 27 May 2021 16:34:41 +0100 Subject: arm64: Import latest version of Cortex Strings' memcmp Import the latest version of the former Cortex Strings - now Arm Optimized Routines - memcmp function based on the upstream code of string/aarch64/memcmp.S at commit e823e3a from https://github.com/ARM-software/optimized-routines Note that for simplicity Arm have chosen to contribute this code to Linux under GPLv2 rather than the original MIT license. Signed-off-by: Sam Tebbs [ rm: update attribution and commit message ] Signed-off-by: Robin Murphy Acked-by: Mark Rutland Link: https://lore.kernel.org/r/2889de2d41054f3f508fb3addad784a3606ef383.1622128527.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- arch/arm64/lib/memcmp.S | 346 +++++++++++++++++------------------------------- 1 file changed, 119 insertions(+), 227 deletions(-) diff --git a/arch/arm64/lib/memcmp.S b/arch/arm64/lib/memcmp.S index c0671e793ea9..498f0d9941d9 100644 --- a/arch/arm64/lib/memcmp.S +++ b/arch/arm64/lib/memcmp.S @@ -1,247 +1,139 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (C) 2013 ARM Ltd. - * Copyright (C) 2013 Linaro. + * Copyright (c) 2013-2020, Arm Limited. * - * This code is based on glibc cortex strings work originally authored by Linaro - * be found @ - * - * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ - * files/head:/src/aarch64/ + * Adapted from the original at: + * https://github.com/ARM-software/optimized-routines/blob/master/string/aarch64/memcmp.S */ #include #include -/* -* compare memory areas(when two memory areas' offset are different, -* alignment handled by the hardware) -* -* Parameters: -* x0 - const memory area 1 pointer -* x1 - const memory area 2 pointer -* x2 - the maximal compare byte length -* Returns: -* x0 - a compare result, maybe less than, equal to, or greater than ZERO -*/ +/* Assumptions: + * + * ARMv8-a, AArch64, unaligned accesses. + */ + +#define L(label) .L ## label /* Parameters and result. */ -src1 .req x0 -src2 .req x1 -limit .req x2 -result .req x0 +#define src1 x0 +#define src2 x1 +#define limit x2 +#define result w0 /* Internal variables. 
*/ -data1 .req x3 -data1w .req w3 -data2 .req x4 -data2w .req w4 -has_nul .req x5 -diff .req x6 -endloop .req x7 -tmp1 .req x8 -tmp2 .req x9 -tmp3 .req x10 -pos .req x11 -limit_wd .req x12 -mask .req x13 +#define data1 x3 +#define data1w w3 +#define data1h x4 +#define data2 x5 +#define data2w w5 +#define data2h x6 +#define tmp1 x7 +#define tmp2 x8 SYM_FUNC_START_WEAK_PI(memcmp) - cbz limit, .Lret0 - eor tmp1, src1, src2 - tst tmp1, #7 - b.ne .Lmisaligned8 - ands tmp1, src1, #7 - b.ne .Lmutual_align - sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ - lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ - /* - * The input source addresses are at alignment boundary. - * Directly compare eight bytes each time. - */ -.Lloop_aligned: - ldr data1, [src1], #8 - ldr data2, [src2], #8 -.Lstart_realigned: - subs limit_wd, limit_wd, #1 - eor diff, data1, data2 /* Non-zero if differences found. */ - csinv endloop, diff, xzr, cs /* Last Dword or differences. */ - cbz endloop, .Lloop_aligned - - /* Not reached the limit, must have found a diff. */ - tbz limit_wd, #63, .Lnot_limit - - /* Limit % 8 == 0 => the diff is in the last 8 bytes. */ - ands limit, limit, #7 - b.eq .Lnot_limit - /* - * The remained bytes less than 8. It is needed to extract valid data - * from last eight bytes of the intended memory range. - */ - lsl limit, limit, #3 /* bytes-> bits. */ - mov mask, #~0 -CPU_BE( lsr mask, mask, limit ) -CPU_LE( lsl mask, mask, limit ) - bic data1, data1, mask - bic data2, data2, mask - - orr diff, diff, mask - b .Lnot_limit - -.Lmutual_align: - /* - * Sources are mutually aligned, but are not currently at an - * alignment boundary. Round down the addresses and then mask off - * the bytes that precede the start point. - */ - bic src1, src1, #7 - bic src2, src2, #7 - ldr data1, [src1], #8 - ldr data2, [src2], #8 - /* - * We can not add limit with alignment offset(tmp1) here. Since the - * addition probably make the limit overflown. - */ - sub limit_wd, limit, #1/*limit != 0, so no underflow.*/ - and tmp3, limit_wd, #7 - lsr limit_wd, limit_wd, #3 - add tmp3, tmp3, tmp1 - add limit_wd, limit_wd, tmp3, lsr #3 - add limit, limit, tmp1/* Adjust the limit for the extra. */ - - lsl tmp1, tmp1, #3/* Bytes beyond alignment -> bits.*/ - neg tmp1, tmp1/* Bits to alignment -64. */ - mov tmp2, #~0 - /*mask off the non-intended bytes before the start address.*/ -CPU_BE( lsl tmp2, tmp2, tmp1 )/*Big-endian.Early bytes are at MSB*/ - /* Little-endian. Early bytes are at LSB. */ -CPU_LE( lsr tmp2, tmp2, tmp1 ) - - orr data1, data1, tmp2 - orr data2, data2, tmp2 - b .Lstart_realigned - - /*src1 and src2 have different alignment offset.*/ -.Lmisaligned8: - cmp limit, #8 - b.lo .Ltiny8proc /*limit < 8: compare byte by byte*/ - - and tmp1, src1, #7 - neg tmp1, tmp1 - add tmp1, tmp1, #8/*valid length in the first 8 bytes of src1*/ - and tmp2, src2, #7 - neg tmp2, tmp2 - add tmp2, tmp2, #8/*valid length in the first 8 bytes of src2*/ - subs tmp3, tmp1, tmp2 - csel pos, tmp1, tmp2, hi /*Choose the maximum.*/ - - sub limit, limit, pos - /*compare the proceeding bytes in the first 8 byte segment.*/ -.Ltinycmp: - ldrb data1w, [src1], #1 - ldrb data2w, [src2], #1 - subs pos, pos, #1 - ccmp data1w, data2w, #0, ne /* NZCV = 0b0000. 
*/ - b.eq .Ltinycmp - cbnz pos, 1f /*diff occurred before the last byte.*/ - cmp data1w, data2w - b.eq .Lstart_align -1: - sub result, data1, data2 + subs limit, limit, 8 + b.lo L(less8) + + ldr data1, [src1], 8 + ldr data2, [src2], 8 + cmp data1, data2 + b.ne L(return) + + subs limit, limit, 8 + b.gt L(more16) + + ldr data1, [src1, limit] + ldr data2, [src2, limit] + b L(return) + +L(more16): + ldr data1, [src1], 8 + ldr data2, [src2], 8 + cmp data1, data2 + bne L(return) + + /* Jump directly to comparing the last 16 bytes for 32 byte (or less) + strings. */ + subs limit, limit, 16 + b.ls L(last_bytes) + + /* We overlap loads between 0-32 bytes at either side of SRC1 when we + try to align, so limit it only to strings larger than 128 bytes. */ + cmp limit, 96 + b.ls L(loop16) + + /* Align src1 and adjust src2 with bytes not yet done. */ + and tmp1, src1, 15 + add limit, limit, tmp1 + sub src1, src1, tmp1 + sub src2, src2, tmp1 + + /* Loop performing 16 bytes per iteration using aligned src1. + Limit is pre-decremented by 16 and must be larger than zero. + Exit if <= 16 bytes left to do or if the data is not equal. */ + .p2align 4 +L(loop16): + ldp data1, data1h, [src1], 16 + ldp data2, data2h, [src2], 16 + subs limit, limit, 16 + ccmp data1, data2, 0, hi + ccmp data1h, data2h, 0, eq + b.eq L(loop16) + + cmp data1, data2 + bne L(return) + mov data1, data1h + mov data2, data2h + cmp data1, data2 + bne L(return) + + /* Compare last 1-16 bytes using unaligned access. */ +L(last_bytes): + add src1, src1, limit + add src2, src2, limit + ldp data1, data1h, [src1] + ldp data2, data2h, [src2] + cmp data1, data2 + bne L(return) + mov data1, data1h + mov data2, data2h + cmp data1, data2 + + /* Compare data bytes and set return value to 0, -1 or 1. */ +L(return): +#ifndef __AARCH64EB__ + rev data1, data1 + rev data2, data2 +#endif + cmp data1, data2 +L(ret_eq): + cset result, ne + cneg result, result, lo ret -.Lstart_align: - lsr limit_wd, limit, #3 - cbz limit_wd, .Lremain8 - - ands xzr, src1, #7 - b.eq .Lrecal_offset - /*process more leading bytes to make src1 aligned...*/ - add src1, src1, tmp3 /*backwards src1 to alignment boundary*/ - add src2, src2, tmp3 - sub limit, limit, tmp3 - lsr limit_wd, limit, #3 - cbz limit_wd, .Lremain8 - /*load 8 bytes from aligned SRC1..*/ - ldr data1, [src1], #8 - ldr data2, [src2], #8 - - subs limit_wd, limit_wd, #1 - eor diff, data1, data2 /*Non-zero if differences found.*/ - csinv endloop, diff, xzr, ne - cbnz endloop, .Lunequal_proc - /*How far is the current SRC2 from the alignment boundary...*/ - and tmp3, tmp3, #7 - -.Lrecal_offset:/*src1 is aligned now..*/ - neg pos, tmp3 -.Lloopcmp_proc: - /* - * Divide the eight bytes into two parts. First,backwards the src2 - * to an alignment boundary,load eight bytes and compare from - * the SRC2 alignment boundary. If all 8 bytes are equal,then start - * the second part's comparison. Otherwise finish the comparison. - * This special handle can garantee all the accesses are in the - * thread/task space in avoid to overrange access. - */ - ldr data1, [src1,pos] - ldr data2, [src2,pos] - eor diff, data1, data2 /* Non-zero if differences found. */ - cbnz diff, .Lnot_limit - - /*The second part process*/ - ldr data1, [src1], #8 - ldr data2, [src2], #8 - eor diff, data1, data2 /* Non-zero if differences found. 
*/ - subs limit_wd, limit_wd, #1 - csinv endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/ - cbz endloop, .Lloopcmp_proc -.Lunequal_proc: - cbz diff, .Lremain8 - -/* There is difference occurred in the latest comparison. */ -.Lnot_limit: -/* -* For little endian,reverse the low significant equal bits into MSB,then -* following CLZ can find how many equal bits exist. -*/ -CPU_LE( rev diff, diff ) -CPU_LE( rev data1, data1 ) -CPU_LE( rev data2, data2 ) - - /* - * The MS-non-zero bit of DIFF marks either the first bit - * that is different, or the end of the significant data. - * Shifting left now will bring the critical information into the - * top bits. - */ - clz pos, diff - lsl data1, data1, pos - lsl data2, data2, pos - /* - * We need to zero-extend (char is unsigned) the value and then - * perform a signed subtraction. - */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 + .p2align 4 + /* Compare up to 8 bytes. Limit is [-8..-1]. */ +L(less8): + adds limit, limit, 4 + b.lo L(less4) + ldr data1w, [src1], 4 + ldr data2w, [src2], 4 + cmp data1w, data2w + b.ne L(return) + sub limit, limit, 4 +L(less4): + adds limit, limit, 4 + beq L(ret_eq) +L(byte_loop): + ldrb data1w, [src1], 1 + ldrb data2w, [src2], 1 + subs limit, limit, 1 + ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ + b.eq L(byte_loop) + sub result, data1w, data2w ret -.Lremain8: - /* Limit % 8 == 0 =>. all data are equal.*/ - ands limit, limit, #7 - b.eq .Lret0 - -.Ltiny8proc: - ldrb data1w, [src1], #1 - ldrb data2w, [src2], #1 - subs limit, limit, #1 - - ccmp data1w, data2w, #0, ne /* NZCV = 0b0000. */ - b.eq .Ltiny8proc - sub result, data1, data2 - ret -.Lret0: - mov result, #0 - ret SYM_FUNC_END_PI(memcmp) EXPORT_SYMBOL_NOKASAN(memcmp) -- cgit v1.2.3 From 758602c04409d8c5a092cef570b2de125ce0f2ae Mon Sep 17 00:00:00 2001 From: Sam Tebbs Date: Thu, 27 May 2021 16:34:42 +0100 Subject: arm64: Import latest version of Cortex Strings' strcmp Import the latest version of the former Cortex Strings - now Arm Optimized Routines - strcmp function based on the upstream code of string/aarch64/strcmp.S at commit afd6244 from https://github.com/ARM-software/optimized-routines Note that for simplicity Arm have chosen to contribute this code to Linux under GPLv2 rather than the original MIT license. Signed-off-by: Sam Tebbs [ rm: update attribution and commit message ] Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/0fe90c90b96b569fbdfd46e47bd1298abb02079e.1622128527.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- arch/arm64/lib/strcmp.S | 289 ++++++++++++++++++++---------------------------- 1 file changed, 121 insertions(+), 168 deletions(-) diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S index 4e79566726c8..e82ccb6c2f93 100644 --- a/arch/arm64/lib/strcmp.S +++ b/arch/arm64/lib/strcmp.S @@ -1,84 +1,123 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (C) 2013 ARM Ltd. - * Copyright (C) 2013 Linaro. + * Copyright (c) 2012-2020, Arm Limited. 
* - * This code is based on glibc cortex strings work originally authored by Linaro - * be found @ - * - * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ - * files/head:/src/aarch64/ + * Adapted from the original at: + * https://github.com/ARM-software/optimized-routines/blob/master/string/aarch64/strcmp.S */ #include #include -/* - * compare two strings +/* Assumptions: * - * Parameters: - * x0 - const string 1 pointer - * x1 - const string 2 pointer - * Returns: - * x0 - an integer less than, equal to, or greater than zero - * if s1 is found, respectively, to be less than, to match, - * or be greater than s2. + * ARMv8-a, AArch64 */ +#define L(label) .L ## label + #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f #define REP8_80 0x8080808080808080 /* Parameters and result. */ -src1 .req x0 -src2 .req x1 -result .req x0 +#define src1 x0 +#define src2 x1 +#define result x0 /* Internal variables. */ -data1 .req x2 -data1w .req w2 -data2 .req x3 -data2w .req w3 -has_nul .req x4 -diff .req x5 -syndrome .req x6 -tmp1 .req x7 -tmp2 .req x8 -tmp3 .req x9 -zeroones .req x10 -pos .req x11 - +#define data1 x2 +#define data1w w2 +#define data2 x3 +#define data2w w3 +#define has_nul x4 +#define diff x5 +#define syndrome x6 +#define tmp1 x7 +#define tmp2 x8 +#define tmp3 x9 +#define zeroones x10 +#define pos x11 + + /* Start of performance-critical section -- one 64B cache line. */ + .align 6 SYM_FUNC_START_WEAK_PI(strcmp) eor tmp1, src1, src2 mov zeroones, #REP8_01 tst tmp1, #7 - b.ne .Lmisaligned8 + b.ne L(misaligned8) ands tmp1, src1, #7 - b.ne .Lmutual_align - - /* - * NUL detection works on the principle that (X - 1) & (~X) & 0x80 - * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - * can be done in parallel across the entire word. - */ -.Lloop_aligned: + b.ne L(mutual_align) + /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + can be done in parallel across the entire word. */ +L(loop_aligned): ldr data1, [src1], #8 ldr data2, [src2], #8 -.Lstart_realigned: +L(start_realigned): sub tmp1, data1, zeroones orr tmp2, data1, #REP8_7f eor diff, data1, data2 /* Non-zero if differences found. */ bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ orr syndrome, diff, has_nul - cbz syndrome, .Lloop_aligned - b .Lcal_cmpresult + cbz syndrome, L(loop_aligned) + /* End of performance-critical section -- one 64B cache line. */ + +L(end): +#ifndef __AARCH64EB__ + rev syndrome, syndrome + rev data1, data1 + /* The MS-non-zero bit of the syndrome marks either the first bit + that is different, or the top bit of the first zero byte. + Shifting left now will bring the critical information into the + top bits. */ + clz pos, syndrome + rev data2, data2 + lsl data1, data1, pos + lsl data2, data2, pos + /* But we need to zero-extend (char is unsigned) the value and then + perform a signed 32-bit subtraction. */ + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 + ret +#else + /* For big-endian we cannot use the trick with the syndrome value + as carry-propagation can corrupt the upper bits if the trailing + bytes in the string contain 0x01. */ + /* However, if there is no NUL byte in the dword, we can generate + the result directly. We can't just subtract the bytes as the + MSB might be significant. */ + cbnz has_nul, 1f + cmp data1, data2 + cset result, ne + cneg result, result, lo + ret +1: + /* Re-compute the NUL-byte detection, using a byte-reversed value. 
*/ + rev tmp3, data1 + sub tmp1, tmp3, zeroones + orr tmp2, tmp3, #REP8_7f + bic has_nul, tmp1, tmp2 + rev has_nul, has_nul + orr syndrome, diff, has_nul + clz pos, syndrome + /* The MS-non-zero bit of the syndrome marks either the first bit + that is different, or the top bit of the first zero byte. + Shifting left now will bring the critical information into the + top bits. */ + lsl data1, data1, pos + lsl data2, data2, pos + /* But we need to zero-extend (char is unsigned) the value and then + perform a signed 32-bit subtraction. */ + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 + ret +#endif -.Lmutual_align: - /* - * Sources are mutually aligned, but are not currently at an - * alignment boundary. Round down the addresses and then mask off - * the bytes that preceed the start point. - */ +L(mutual_align): + /* Sources are mutually aligned, but are not currently at an + alignment boundary. Round down the addresses and then mask off + the bytes that preceed the start point. */ bic src1, src1, #7 bic src2, src2, #7 lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ @@ -86,138 +125,52 @@ SYM_FUNC_START_WEAK_PI(strcmp) neg tmp1, tmp1 /* Bits to alignment -64. */ ldr data2, [src2], #8 mov tmp2, #~0 +#ifdef __AARCH64EB__ /* Big-endian. Early bytes are at MSB. */ -CPU_BE( lsl tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */ + lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ +#else /* Little-endian. Early bytes are at LSB. */ -CPU_LE( lsr tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */ - + lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ +#endif orr data1, data1, tmp2 orr data2, data2, tmp2 - b .Lstart_realigned - -.Lmisaligned8: - /* - * Get the align offset length to compare per byte first. - * After this process, one string's address will be aligned. - */ - and tmp1, src1, #7 - neg tmp1, tmp1 - add tmp1, tmp1, #8 - and tmp2, src2, #7 - neg tmp2, tmp2 - add tmp2, tmp2, #8 - subs tmp3, tmp1, tmp2 - csel pos, tmp1, tmp2, hi /*Choose the maximum. */ -.Ltinycmp: + b L(start_realigned) + +L(misaligned8): + /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always + checking to make sure that we don't access beyond page boundary in + SRC2. */ + tst src1, #7 + b.eq L(loop_misaligned) +L(do_misaligned): ldrb data1w, [src1], #1 ldrb data2w, [src2], #1 - subs pos, pos, #1 - ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */ - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.eq .Ltinycmp - cbnz pos, 1f /*find the null or unequal...*/ cmp data1w, #1 - ccmp data1w, data2w, #0, cs - b.eq .Lstart_align /*the last bytes are equal....*/ -1: - sub result, data1, data2 - ret - -.Lstart_align: - ands xzr, src1, #7 - b.eq .Lrecal_offset - /*process more leading bytes to make str1 aligned...*/ - add src1, src1, tmp3 - add src2, src2, tmp3 - /*load 8 bytes from aligned str1 and non-aligned str2..*/ + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + b.ne L(done) + tst src1, #7 + b.ne L(do_misaligned) + +L(loop_misaligned): + /* Test if we are within the last dword of the end of a 4K page. If + yes then jump back to the misaligned loop to copy a byte at a time. */ + and tmp1, src2, #0xff8 + eor tmp1, tmp1, #0xff8 + cbz tmp1, L(do_misaligned) ldr data1, [src1], #8 ldr data2, [src2], #8 sub tmp1, data1, zeroones orr tmp2, data1, #REP8_7f - bic has_nul, tmp1, tmp2 - eor diff, data1, data2 /* Non-zero if differences found. 
*/ - orr syndrome, diff, has_nul - cbnz syndrome, .Lcal_cmpresult - /*How far is the current str2 from the alignment boundary...*/ - and tmp3, tmp3, #7 -.Lrecal_offset: - neg pos, tmp3 -.Lloopcmp_proc: - /* - * Divide the eight bytes into two parts. First,backwards the src2 - * to an alignment boundary,load eight bytes from the SRC2 alignment - * boundary,then compare with the relative bytes from SRC1. - * If all 8 bytes are equal,then start the second part's comparison. - * Otherwise finish the comparison. - * This special handle can garantee all the accesses are in the - * thread/task space in avoid to overrange access. - */ - ldr data1, [src1,pos] - ldr data2, [src2,pos] - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - bic has_nul, tmp1, tmp2 - eor diff, data1, data2 /* Non-zero if differences found. */ - orr syndrome, diff, has_nul - cbnz syndrome, .Lcal_cmpresult - - /*The second part process*/ - ldr data1, [src1], #8 - ldr data2, [src2], #8 - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - bic has_nul, tmp1, tmp2 - eor diff, data1, data2 /* Non-zero if differences found. */ + eor diff, data1, data2 /* Non-zero if differences found. */ + bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ orr syndrome, diff, has_nul - cbz syndrome, .Lloopcmp_proc + cbz syndrome, L(loop_misaligned) + b L(end) -.Lcal_cmpresult: - /* - * reversed the byte-order as big-endian,then CLZ can find the most - * significant zero bits. - */ -CPU_LE( rev syndrome, syndrome ) -CPU_LE( rev data1, data1 ) -CPU_LE( rev data2, data2 ) - - /* - * For big-endian we cannot use the trick with the syndrome value - * as carry-propagation can corrupt the upper bits if the trailing - * bytes in the string contain 0x01. - * However, if there is no NUL byte in the dword, we can generate - * the result directly. We cannot just subtract the bytes as the - * MSB might be significant. - */ -CPU_BE( cbnz has_nul, 1f ) -CPU_BE( cmp data1, data2 ) -CPU_BE( cset result, ne ) -CPU_BE( cneg result, result, lo ) -CPU_BE( ret ) -CPU_BE( 1: ) - /*Re-compute the NUL-byte detection, using a byte-reversed value. */ -CPU_BE( rev tmp3, data1 ) -CPU_BE( sub tmp1, tmp3, zeroones ) -CPU_BE( orr tmp2, tmp3, #REP8_7f ) -CPU_BE( bic has_nul, tmp1, tmp2 ) -CPU_BE( rev has_nul, has_nul ) -CPU_BE( orr syndrome, diff, has_nul ) - - clz pos, syndrome - /* - * The MS-non-zero bit of the syndrome marks either the first bit - * that is different, or the top bit of the first zero byte. - * Shifting left now will bring the critical information into the - * top bits. - */ - lsl data1, data1, pos - lsl data2, data2, pos - /* - * But we need to zero-extend (char is unsigned) the value and then - * perform a signed 32-bit subtraction. - */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 +L(done): + sub result, data1, data2 ret + SYM_FUNC_END_PI(strcmp) EXPORT_SYMBOL_NOKASAN(strcmp) -- cgit v1.2.3 From 325a1de81287a3d4ea2b8e6528a534c6c3a7c608 Mon Sep 17 00:00:00 2001 From: Sam Tebbs Date: Thu, 27 May 2021 16:34:43 +0100 Subject: arm64: Import updated version of Cortex Strings' strlen Import an updated version of the former Cortex Strings - now Arm Optimized Routines - strcmp function. 
The latest version introduces Advanced SIMD usage which rules it out for our purposes, but we can still pick an intermediate improvement from the previous version, namely string/aarch64/strlen.S at commit 98e4d6a from https://github.com/ARM-software/optimized-routines Note that for simplicity Arm have chosen to contribute this code to Linux under GPLv2 rather than the original MIT license. Signed-off-by: Sam Tebbs [ rm: update attribution and commit message ] Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/32e3489398a24b23ae6e996935ac4818f8fd9dfd.1622128527.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- arch/arm64/lib/strlen.S | 258 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 173 insertions(+), 85 deletions(-) diff --git a/arch/arm64/lib/strlen.S b/arch/arm64/lib/strlen.S index ee3ed882dd79..b557185b54a5 100644 --- a/arch/arm64/lib/strlen.S +++ b/arch/arm64/lib/strlen.S @@ -1,115 +1,203 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (C) 2013 ARM Ltd. - * Copyright (C) 2013 Linaro. + * Copyright (c) 2013, Arm Limited. * - * This code is based on glibc cortex strings work originally authored by Linaro - * be found @ - * - * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ - * files/head:/src/aarch64/ + * Adapted from the original at: + * https://github.com/ARM-software/optimized-routines/blob/master/string/aarch64/strlen.S */ #include #include -/* - * calculate the length of a string +/* Assumptions: * - * Parameters: - * x0 - const string pointer - * Returns: - * x0 - the return length of specific string + * ARMv8-a, AArch64, unaligned accesses, min page size 4k. */ +#define L(label) .L ## label + /* Arguments and results. */ -srcin .req x0 -len .req x0 +#define srcin x0 +#define len x0 /* Locals and temporaries. */ -src .req x1 -data1 .req x2 -data2 .req x3 -data2a .req x4 -has_nul1 .req x5 -has_nul2 .req x6 -tmp1 .req x7 -tmp2 .req x8 -tmp3 .req x9 -tmp4 .req x10 -zeroones .req x11 -pos .req x12 +#define src x1 +#define data1 x2 +#define data2 x3 +#define has_nul1 x4 +#define has_nul2 x5 +#define tmp1 x4 +#define tmp2 x5 +#define tmp3 x6 +#define tmp4 x7 +#define zeroones x8 + + /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + can be done in parallel across the entire word. A faster check + (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives + false hits for characters 129..255. */ #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f #define REP8_80 0x8080808080808080 +#define MIN_PAGE_SIZE 4096 + + /* Since strings are short on average, we check the first 16 bytes + of the string for a NUL character. In order to do an unaligned ldp + safely we have to do a page cross check first. If there is a NUL + byte we calculate the length from the 2 8-byte words using + conditional select to reduce branch mispredictions (it is unlikely + strlen will be repeatedly called on strings with the same length). + + If the string is longer than 16 bytes, we align src so don't need + further page cross checks, and process 32 bytes per iteration + using the fast NUL check. If we encounter non-ASCII characters, + fallback to a second loop using the full NUL check. + + If the page cross check fails, we read 16 bytes from an aligned + address, remove any characters before the string, and continue + in the main loop using aligned loads. 
Since strings crossing a + page in the first 16 bytes are rare (probability of + 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized. + + AArch64 systems have a minimum page size of 4k. We don't bother + checking for larger page sizes - the cost of setting up the correct + page size is just not worth the extra gain from a small reduction in + the cases taking the slow path. Note that we only care about + whether the first fetch, which may be misaligned, crosses a page + boundary. */ + SYM_FUNC_START_WEAK_PI(strlen) - mov zeroones, #REP8_01 - bic src, srcin, #15 - ands tmp1, srcin, #15 - b.ne .Lmisaligned - /* - * NUL detection works on the principle that (X - 1) & (~X) & 0x80 - * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - * can be done in parallel across the entire word. - */ - /* - * The inner loop deals with two Dwords at a time. This has a - * slightly higher start-up cost, but we should win quite quickly, - * especially on cores with a high number of issue slots per - * cycle, as we get much better parallelism out of the operations. - */ -.Lloop: - ldp data1, data2, [src], #16 -.Lrealigned: + and tmp1, srcin, MIN_PAGE_SIZE - 1 + mov zeroones, REP8_01 + cmp tmp1, MIN_PAGE_SIZE - 16 + b.gt L(page_cross) + ldp data1, data2, [srcin] +#ifdef __AARCH64EB__ + /* For big-endian, carry propagation (if the final byte in the + string is 0x01) means we cannot use has_nul1/2 directly. + Since we expect strings to be small and early-exit, + byte-swap the data now so has_null1/2 will be correct. */ + rev data1, data1 + rev data2, data2 +#endif sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f + orr tmp2, data1, REP8_7f sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f - bic has_nul1, tmp1, tmp2 - bics has_nul2, tmp3, tmp4 - ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ - b.eq .Lloop + orr tmp4, data2, REP8_7f + bics has_nul1, tmp1, tmp2 + bic has_nul2, tmp3, tmp4 + ccmp has_nul2, 0, 0, eq + beq L(main_loop_entry) + + /* Enter with C = has_nul1 == 0. */ + csel has_nul1, has_nul1, has_nul2, cc + mov len, 8 + rev has_nul1, has_nul1 + clz tmp1, has_nul1 + csel len, xzr, len, cc + add len, len, tmp1, lsr 3 + ret + /* The inner loop processes 32 bytes per iteration and uses the fast + NUL check. If we encounter non-ASCII characters, use a second + loop with the accurate NUL check. */ + .p2align 4 +L(main_loop_entry): + bic src, srcin, 15 + sub src, src, 16 +L(main_loop): + ldp data1, data2, [src, 32]! +L(page_cross_entry): + sub tmp1, data1, zeroones + sub tmp3, data2, zeroones + orr tmp2, tmp1, tmp3 + tst tmp2, zeroones, lsl 7 + bne 1f + ldp data1, data2, [src, 16] + sub tmp1, data1, zeroones + sub tmp3, data2, zeroones + orr tmp2, tmp1, tmp3 + tst tmp2, zeroones, lsl 7 + beq L(main_loop) + add src, src, 16 +1: + /* The fast check failed, so do the slower, accurate NUL check. */ + orr tmp2, data1, REP8_7f + orr tmp4, data2, REP8_7f + bics has_nul1, tmp1, tmp2 + bic has_nul2, tmp3, tmp4 + ccmp has_nul2, 0, 0, eq + beq L(nonascii_loop) + + /* Enter with C = has_nul1 == 0. */ +L(tail): +#ifdef __AARCH64EB__ + /* For big-endian, carry propagation (if the final byte in the + string is 0x01) means we cannot use has_nul1/2 directly. The + easiest way to get the correct byte is to byte-swap the data + and calculate the syndrome a second time. 
*/ + csel data1, data1, data2, cc + rev data1, data1 + sub tmp1, data1, zeroones + orr tmp2, data1, REP8_7f + bic has_nul1, tmp1, tmp2 +#else + csel has_nul1, has_nul1, has_nul2, cc +#endif sub len, src, srcin - cbz has_nul1, .Lnul_in_data2 -CPU_BE( mov data2, data1 ) /*prepare data to re-calculate the syndrome*/ - sub len, len, #8 - mov has_nul2, has_nul1 -.Lnul_in_data2: - /* - * For big-endian, carry propagation (if the final byte in the - * string is 0x01) means we cannot use has_nul directly. The - * easiest way to get the correct byte is to byte-swap the data - * and calculate the syndrome a second time. - */ -CPU_BE( rev data2, data2 ) -CPU_BE( sub tmp1, data2, zeroones ) -CPU_BE( orr tmp2, data2, #REP8_7f ) -CPU_BE( bic has_nul2, tmp1, tmp2 ) - - sub len, len, #8 - rev has_nul2, has_nul2 - clz pos, has_nul2 - add len, len, pos, lsr #3 /* Bits to bytes. */ + rev has_nul1, has_nul1 + add tmp2, len, 8 + clz tmp1, has_nul1 + csel len, len, tmp2, cc + add len, len, tmp1, lsr 3 ret -.Lmisaligned: - cmp tmp1, #8 - neg tmp1, tmp1 - ldp data1, data2, [src], #16 - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ - mov tmp2, #~0 - /* Big-endian. Early bytes are at MSB. */ -CPU_BE( lsl tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */ +L(nonascii_loop): + ldp data1, data2, [src, 16]! + sub tmp1, data1, zeroones + orr tmp2, data1, REP8_7f + sub tmp3, data2, zeroones + orr tmp4, data2, REP8_7f + bics has_nul1, tmp1, tmp2 + bic has_nul2, tmp3, tmp4 + ccmp has_nul2, 0, 0, eq + bne L(tail) + ldp data1, data2, [src, 16]! + sub tmp1, data1, zeroones + orr tmp2, data1, REP8_7f + sub tmp3, data2, zeroones + orr tmp4, data2, REP8_7f + bics has_nul1, tmp1, tmp2 + bic has_nul2, tmp3, tmp4 + ccmp has_nul2, 0, 0, eq + beq L(nonascii_loop) + b L(tail) + + /* Load 16 bytes from [srcin & ~15] and force the bytes that precede + srcin to 0x7f, so we ignore any NUL bytes before the string. + Then continue in the aligned loop. */ +L(page_cross): + bic src, srcin, 15 + ldp data1, data2, [src] + lsl tmp1, srcin, 3 + mov tmp4, -1 +#ifdef __AARCH64EB__ + /* Big-endian. Early bytes are at MSB. */ + lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */ +#else /* Little-endian. Early bytes are at LSB. */ -CPU_LE( lsr tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */ + lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */ +#endif + orr tmp1, tmp1, REP8_80 + orn data1, data1, tmp1 + orn tmp2, data2, tmp1 + tst srcin, 8 + csel data1, data1, tmp4, eq + csel data2, data2, tmp2, eq + b L(page_cross_entry) - orr data1, data1, tmp2 - orr data2a, data2, tmp2 - csinv data1, data1, xzr, le - csel data2, data2, data2a, le - b .Lrealigned SYM_FUNC_END_PI(strlen) EXPORT_SYMBOL_NOKASAN(strlen) -- cgit v1.2.3 From 020b199bc70d98d92e1bbc6a71358d7293ebc5ea Mon Sep 17 00:00:00 2001 From: Sam Tebbs Date: Thu, 27 May 2021 16:34:44 +0100 Subject: arm64: Import latest version of Cortex Strings' strncmp Import the latest version of the former Cortex Strings - now Arm Optimized Routines - strncmp function based on the upstream code of string/aarch64/strncmp.S at commit e823e3a from https://github.com/ARM-software/optimized-routines Note that for simplicity Arm have chosen to contribute this code to Linux under GPLv2 rather than the original MIT license. 
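All of these imported routines lean on the same word-at-a-time NUL-detection identity quoted in their comments: (X - 1) & ~(X | 0x7f..7f) is non-zero iff some byte of X is zero. A standalone C sketch of the idea, illustrative only and not code from the patches (the helper name is invented):

    #include <stdbool.h>
    #include <stdint.h>

    #define REP8_01 0x0101010101010101ULL
    #define REP8_7f 0x7f7f7f7f7f7f7f7fULL

    /*
     * Non-zero iff at least one byte of @x is 0x00: ~(x | REP8_7f) keeps
     * only the top bit of bytes below 0x80, and (x - REP8_01) sets that top
     * bit for a byte that was zero, so the AND is non-zero exactly when a
     * zero byte is present.
     */
    static bool has_zero_byte(uint64_t x)
    {
            return ((x - REP8_01) & ~(x | REP8_7f)) != 0;
    }
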
Signed-off-by: Sam Tebbs [ rm: update attribution and commit message ] Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/26110bee02ad360596c9a7536af7eaaf6890d0e8.1622128527.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- arch/arm64/lib/strncmp.S | 406 +++++++++++++++++++++-------------------------- 1 file changed, 184 insertions(+), 222 deletions(-) diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S index 2a7ee949ed47..0c0bf5462de0 100644 --- a/arch/arm64/lib/strncmp.S +++ b/arch/arm64/lib/strncmp.S @@ -1,299 +1,261 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (C) 2013 ARM Ltd. - * Copyright (C) 2013 Linaro. + * Copyright (c) 2013, Arm Limited. * - * This code is based on glibc cortex strings work originally authored by Linaro - * be found @ - * - * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ - * files/head:/src/aarch64/ + * Adapted from the original at: + * https://github.com/ARM-software/optimized-routines/blob/master/string/aarch64/strncmp.S */ #include #include -/* - * compare two strings +/* Assumptions: * - * Parameters: - * x0 - const string 1 pointer - * x1 - const string 2 pointer - * x2 - the maximal length to be compared - * Returns: - * x0 - an integer less than, equal to, or greater than zero if s1 is found, - * respectively, to be less than, to match, or be greater than s2. + * ARMv8-a, AArch64 */ +#define L(label) .L ## label + #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f #define REP8_80 0x8080808080808080 /* Parameters and result. */ -src1 .req x0 -src2 .req x1 -limit .req x2 -result .req x0 +#define src1 x0 +#define src2 x1 +#define limit x2 +#define result x0 /* Internal variables. */ -data1 .req x3 -data1w .req w3 -data2 .req x4 -data2w .req w4 -has_nul .req x5 -diff .req x6 -syndrome .req x7 -tmp1 .req x8 -tmp2 .req x9 -tmp3 .req x10 -zeroones .req x11 -pos .req x12 -limit_wd .req x13 -mask .req x14 -endloop .req x15 +#define data1 x3 +#define data1w w3 +#define data2 x4 +#define data2w w4 +#define has_nul x5 +#define diff x6 +#define syndrome x7 +#define tmp1 x8 +#define tmp2 x9 +#define tmp3 x10 +#define zeroones x11 +#define pos x12 +#define limit_wd x13 +#define mask x14 +#define endloop x15 +#define count mask SYM_FUNC_START_WEAK_PI(strncmp) - cbz limit, .Lret0 + cbz limit, L(ret0) eor tmp1, src1, src2 mov zeroones, #REP8_01 tst tmp1, #7 - b.ne .Lmisaligned8 - ands tmp1, src1, #7 - b.ne .Lmutual_align + and count, src1, #7 + b.ne L(misaligned8) + cbnz count, L(mutual_align) /* Calculate the number of full and partial words -1. */ - /* - * when limit is mulitply of 8, if not sub 1, - * the judgement of last dword will wrong. - */ - sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ - lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ + sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ + lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ - /* - * NUL detection works on the principle that (X - 1) & (~X) & 0x80 - * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - * can be done in parallel across the entire word. - */ -.Lloop_aligned: + /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + can be done in parallel across the entire word. 
*/ + .p2align 4 +L(loop_aligned): ldr data1, [src1], #8 ldr data2, [src2], #8 -.Lstart_realigned: +L(start_realigned): subs limit_wd, limit_wd, #1 sub tmp1, data1, zeroones orr tmp2, data1, #REP8_7f - eor diff, data1, data2 /* Non-zero if differences found. */ - csinv endloop, diff, xzr, pl /* Last Dword or differences.*/ - bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + eor diff, data1, data2 /* Non-zero if differences found. */ + csinv endloop, diff, xzr, pl /* Last Dword or differences. */ + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ ccmp endloop, #0, #0, eq - b.eq .Lloop_aligned + b.eq L(loop_aligned) + /* End of main loop */ - /*Not reached the limit, must have found the end or a diff. */ - tbz limit_wd, #63, .Lnot_limit + /* Not reached the limit, must have found the end or a diff. */ + tbz limit_wd, #63, L(not_limit) /* Limit % 8 == 0 => all bytes significant. */ ands limit, limit, #7 - b.eq .Lnot_limit + b.eq L(not_limit) - lsl limit, limit, #3 /* Bits -> bytes. */ + lsl limit, limit, #3 /* Bits -> bytes. */ mov mask, #~0 -CPU_BE( lsr mask, mask, limit ) -CPU_LE( lsl mask, mask, limit ) +#ifdef __AARCH64EB__ + lsr mask, mask, limit +#else + lsl mask, mask, limit +#endif bic data1, data1, mask bic data2, data2, mask /* Make sure that the NUL byte is marked in the syndrome. */ orr has_nul, has_nul, mask -.Lnot_limit: +L(not_limit): orr syndrome, diff, has_nul - b .Lcal_cmpresult -.Lmutual_align: - /* - * Sources are mutually aligned, but are not currently at an - * alignment boundary. Round down the addresses and then mask off - * the bytes that precede the start point. - * We also need to adjust the limit calculations, but without - * overflowing if the limit is near ULONG_MAX. - */ +#ifndef __AARCH64EB__ + rev syndrome, syndrome + rev data1, data1 + /* The MS-non-zero bit of the syndrome marks either the first bit + that is different, or the top bit of the first zero byte. + Shifting left now will bring the critical information into the + top bits. */ + clz pos, syndrome + rev data2, data2 + lsl data1, data1, pos + lsl data2, data2, pos + /* But we need to zero-extend (char is unsigned) the value and then + perform a signed 32-bit subtraction. */ + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 + ret +#else + /* For big-endian we cannot use the trick with the syndrome value + as carry-propagation can corrupt the upper bits if the trailing + bytes in the string contain 0x01. */ + /* However, if there is no NUL byte in the dword, we can generate + the result directly. We can't just subtract the bytes as the + MSB might be significant. */ + cbnz has_nul, 1f + cmp data1, data2 + cset result, ne + cneg result, result, lo + ret +1: + /* Re-compute the NUL-byte detection, using a byte-reversed value. */ + rev tmp3, data1 + sub tmp1, tmp3, zeroones + orr tmp2, tmp3, #REP8_7f + bic has_nul, tmp1, tmp2 + rev has_nul, has_nul + orr syndrome, diff, has_nul + clz pos, syndrome + /* The MS-non-zero bit of the syndrome marks either the first bit + that is different, or the top bit of the first zero byte. + Shifting left now will bring the critical information into the + top bits. */ + lsl data1, data1, pos + lsl data2, data2, pos + /* But we need to zero-extend (char is unsigned) the value and then + perform a signed 32-bit subtraction. */ + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 + ret +#endif + +L(mutual_align): + /* Sources are mutually aligned, but are not currently at an + alignment boundary. 
Round down the addresses and then mask off + the bytes that precede the start point. + We also need to adjust the limit calculations, but without + overflowing if the limit is near ULONG_MAX. */ bic src1, src1, #7 bic src2, src2, #7 ldr data1, [src1], #8 - neg tmp3, tmp1, lsl #3 /* 64 - bits(bytes beyond align). */ + neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */ ldr data2, [src2], #8 mov tmp2, #~0 - sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ + sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ +#ifdef __AARCH64EB__ /* Big-endian. Early bytes are at MSB. */ -CPU_BE( lsl tmp2, tmp2, tmp3 ) /* Shift (tmp1 & 63). */ + lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */ +#else /* Little-endian. Early bytes are at LSB. */ -CPU_LE( lsr tmp2, tmp2, tmp3 ) /* Shift (tmp1 & 63). */ - + lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */ +#endif and tmp3, limit_wd, #7 lsr limit_wd, limit_wd, #3 - /* Adjust the limit. Only low 3 bits used, so overflow irrelevant.*/ - add limit, limit, tmp1 - add tmp3, tmp3, tmp1 + /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */ + add limit, limit, count + add tmp3, tmp3, count orr data1, data1, tmp2 orr data2, data2, tmp2 add limit_wd, limit_wd, tmp3, lsr #3 - b .Lstart_realigned + b L(start_realigned) + + .p2align 4 + /* Don't bother with dwords for up to 16 bytes. */ +L(misaligned8): + cmp limit, #16 + b.hs L(try_misaligned_words) -/*when src1 offset is not equal to src2 offset...*/ -.Lmisaligned8: - cmp limit, #8 - b.lo .Ltiny8proc /*limit < 8... */ - /* - * Get the align offset length to compare per byte first. - * After this process, one string's address will be aligned.*/ - and tmp1, src1, #7 - neg tmp1, tmp1 - add tmp1, tmp1, #8 - and tmp2, src2, #7 - neg tmp2, tmp2 - add tmp2, tmp2, #8 - subs tmp3, tmp1, tmp2 - csel pos, tmp1, tmp2, hi /*Choose the maximum. */ - /* - * Here, limit is not less than 8, so directly run .Ltinycmp - * without checking the limit.*/ - sub limit, limit, pos -.Ltinycmp: +L(byte_loop): + /* Perhaps we can do better than this. */ ldrb data1w, [src1], #1 ldrb data2w, [src2], #1 - subs pos, pos, #1 - ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */ - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.eq .Ltinycmp - cbnz pos, 1f /*find the null or unequal...*/ - cmp data1w, #1 - ccmp data1w, data2w, #0, cs - b.eq .Lstart_align /*the last bytes are equal....*/ -1: + subs limit, limit, #1 + ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */ + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + b.eq L(byte_loop) +L(done): sub result, data1, data2 ret - -.Lstart_align: + /* Align the SRC1 to a dword by doing a bytewise compare and then do + the dword loop. */ +L(try_misaligned_words): lsr limit_wd, limit, #3 - cbz limit_wd, .Lremain8 - /*process more leading bytes to make str1 aligned...*/ - ands xzr, src1, #7 - b.eq .Lrecal_offset - add src1, src1, tmp3 /*tmp3 is positive in this branch.*/ - add src2, src2, tmp3 - ldr data1, [src1], #8 - ldr data2, [src2], #8 + cbz count, L(do_misaligned) - sub limit, limit, tmp3 + neg count, count + and count, count, #7 + sub limit, limit, count lsr limit_wd, limit, #3 - subs limit_wd, limit_wd, #1 - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - eor diff, data1, data2 /* Non-zero if differences found. 
*/ - csinv endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/ - bics has_nul, tmp1, tmp2 - ccmp endloop, #0, #0, eq /*has_null is ZERO: no null byte*/ - b.ne .Lunequal_proc - /*How far is the current str2 from the alignment boundary...*/ - and tmp3, tmp3, #7 -.Lrecal_offset: - neg pos, tmp3 -.Lloopcmp_proc: - /* - * Divide the eight bytes into two parts. First,backwards the src2 - * to an alignment boundary,load eight bytes from the SRC2 alignment - * boundary,then compare with the relative bytes from SRC1. - * If all 8 bytes are equal,then start the second part's comparison. - * Otherwise finish the comparison. - * This special handle can garantee all the accesses are in the - * thread/task space in avoid to overrange access. - */ - ldr data1, [src1,pos] - ldr data2, [src2,pos] - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ - eor diff, data1, data2 /* Non-zero if differences found. */ - csinv endloop, diff, xzr, eq - cbnz endloop, .Lunequal_proc +L(page_end_loop): + ldrb data1w, [src1], #1 + ldrb data2w, [src2], #1 + cmp data1w, #1 + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + b.ne L(done) + subs count, count, #1 + b.hi L(page_end_loop) + +L(do_misaligned): + /* Prepare ourselves for the next page crossing. Unlike the aligned + loop, we fetch 1 less dword because we risk crossing bounds on + SRC2. */ + mov count, #8 + subs limit_wd, limit_wd, #1 + b.lo L(done_loop) +L(loop_misaligned): + and tmp2, src2, #0xff8 + eor tmp2, tmp2, #0xff8 + cbz tmp2, L(page_end_loop) - /*The second part process*/ ldr data1, [src1], #8 ldr data2, [src2], #8 - subs limit_wd, limit_wd, #1 sub tmp1, data1, zeroones orr tmp2, data1, #REP8_7f - eor diff, data1, data2 /* Non-zero if differences found. */ - csinv endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/ - bics has_nul, tmp1, tmp2 - ccmp endloop, #0, #0, eq /*has_null is ZERO: no null byte*/ - b.eq .Lloopcmp_proc - -.Lunequal_proc: - orr syndrome, diff, has_nul - cbz syndrome, .Lremain8 -.Lcal_cmpresult: - /* - * reversed the byte-order as big-endian,then CLZ can find the most - * significant zero bits. - */ -CPU_LE( rev syndrome, syndrome ) -CPU_LE( rev data1, data1 ) -CPU_LE( rev data2, data2 ) - /* - * For big-endian we cannot use the trick with the syndrome value - * as carry-propagation can corrupt the upper bits if the trailing - * bytes in the string contain 0x01. - * However, if there is no NUL byte in the dword, we can generate - * the result directly. We can't just subtract the bytes as the - * MSB might be significant. - */ -CPU_BE( cbnz has_nul, 1f ) -CPU_BE( cmp data1, data2 ) -CPU_BE( cset result, ne ) -CPU_BE( cneg result, result, lo ) -CPU_BE( ret ) -CPU_BE( 1: ) - /* Re-compute the NUL-byte detection, using a byte-reversed value.*/ -CPU_BE( rev tmp3, data1 ) -CPU_BE( sub tmp1, tmp3, zeroones ) -CPU_BE( orr tmp2, tmp3, #REP8_7f ) -CPU_BE( bic has_nul, tmp1, tmp2 ) -CPU_BE( rev has_nul, has_nul ) -CPU_BE( orr syndrome, diff, has_nul ) - /* - * The MS-non-zero bit of the syndrome marks either the first bit - * that is different, or the top bit of the first zero byte. - * Shifting left now will bring the critical information into the - * top bits. - */ - clz pos, syndrome - lsl data1, data1, pos - lsl data2, data2, pos - /* - * But we need to zero-extend (char is unsigned) the value and then - * perform a signed 32-bit subtraction. 
- */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 - ret - -.Lremain8: - /* Limit % 8 == 0 => all bytes significant. */ - ands limit, limit, #7 - b.eq .Lret0 -.Ltiny8proc: - ldrb data1w, [src1], #1 - ldrb data2w, [src2], #1 - subs limit, limit, #1 + eor diff, data1, data2 /* Non-zero if differences found. */ + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + ccmp diff, #0, #0, eq + b.ne L(not_limit) + subs limit_wd, limit_wd, #1 + b.pl L(loop_misaligned) - ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */ - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.eq .Ltiny8proc - sub result, data1, data2 - ret +L(done_loop): + /* We found a difference or a NULL before the limit was reached. */ + and limit, limit, #7 + cbz limit, L(not_limit) + /* Read the last word. */ + sub src1, src1, 8 + sub src2, src2, 8 + ldr data1, [src1, limit] + ldr data2, [src2, limit] + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + ccmp diff, #0, #0, eq + b.ne L(not_limit) -.Lret0: +L(ret0): mov result, #0 ret + SYM_FUNC_END_PI(strncmp) EXPORT_SYMBOL_NOKASAN(strncmp) -- cgit v1.2.3 From b6c4ea48415d26ec08fb67fbbd3eefdb1f96ffa6 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 27 May 2021 16:34:45 +0100 Subject: arm64: Add assembly annotations for weak-PI-alias madness Add yet another set of assembly symbol annotations, this time for the borderline-absurd situation of a function aliasing to a weak symbol which itself also wants a position-independent alias. Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/75545b3c4129b20b887474bb58a9cf302bf2132b.1622128527.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/linkage.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm64/include/asm/linkage.h b/arch/arm64/include/asm/linkage.h index ba89a9af820a..9906541a6861 100644 --- a/arch/arm64/include/asm/linkage.h +++ b/arch/arm64/include/asm/linkage.h @@ -56,8 +56,16 @@ SYM_FUNC_START_ALIAS(__pi_##x); \ SYM_FUNC_START_WEAK(x) +#define SYM_FUNC_START_WEAK_ALIAS_PI(x) \ + SYM_FUNC_START_ALIAS(__pi_##x); \ + SYM_START(x, SYM_L_WEAK, SYM_A_ALIGN) + #define SYM_FUNC_END_PI(x) \ SYM_FUNC_END(x); \ SYM_FUNC_END_ALIAS(__pi_##x) +#define SYM_FUNC_END_ALIAS_PI(x) \ + SYM_FUNC_END_ALIAS(x); \ + SYM_FUNC_END_ALIAS(__pi_##x) + #endif -- cgit v1.2.3 From 285133040e6ce0e6f37db962f2b4dad10ea46da0 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 27 May 2021 16:34:46 +0100 Subject: arm64: Import latest memcpy()/memmove() implementation Import the latest implementation of memcpy(), based on the upstream code of string/aarch64/memcpy.S at commit afd6244 from https://github.com/ARM-software/optimized-routines, and subsuming memmove() in the process. Note that for simplicity Arm have chosen to contribute this code to Linux under GPLv2 rather than the original MIT license. Note also that the needs of the usercopy routines vs. regular memcpy() have now diverged so far that we abandon the shared template idea and the damage which that incurred to the tuning of LDP/STP loops. We'll be back to tackle those routines separately in future. 
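For readers skimming the description, the "single entry point" for memcpy() and memmove() hinges on one unsigned comparison between the pointer distance and the length. The C fragment below is only a rough sketch of that decision under our own naming (copy_sketch is not part of the imported routine and does not reflect its structure):

  #include <stddef.h>
  #include <stdint.h>

  /* Sketch: one routine serving both memcpy() and memmove().
   * If dst lies outside [src, src + n), a forward copy is safe; the single
   * unsigned compare covers both "dst below src" and "dst at least n bytes
   * above src". Otherwise dst falls inside the source range, so copy
   * backwards from the end. */
  static void copy_sketch(unsigned char *dst, const unsigned char *src, size_t n)
  {
          if ((uintptr_t)(dst - src) >= n) {
                  for (size_t i = 0; i < n; i++)
                          dst[i] = src[i];        /* forward copy */
          } else {
                  while (n--)
                          dst[n] = src[n];        /* overlapping: copy backwards */
          }
  }

The imported routine only needs this test for large copies; small and medium sizes are handled with overlapping loads and stores from both ends of the buffer, and the large-copy paths then stream 64 bytes per iteration with LDP/STP pairs, finishing by copying 64 bytes from the end.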
Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/3c953af43506581b2422f61952261e76949ba711.1622128527.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- arch/arm64/lib/Makefile | 2 +- arch/arm64/lib/memcpy.S | 272 +++++++++++++++++++++++++++++++++++++++-------- arch/arm64/lib/memmove.S | 189 -------------------------------- 3 files changed, 230 insertions(+), 233 deletions(-) delete mode 100644 arch/arm64/lib/memmove.S diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index d31e1169d9b8..01c596aa539c 100644 --- a/arch/arm64/lib/Makefile +++ b/arch/arm64/lib/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 lib-y := clear_user.o delay.o copy_from_user.o \ copy_to_user.o copy_in_user.o copy_page.o \ - clear_page.o csum.o memchr.o memcpy.o memmove.o \ + clear_page.o csum.o memchr.o memcpy.o \ memset.o memcmp.o strcmp.o strncmp.o strlen.o \ strnlen.o strchr.o strrchr.o tishift.o diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S index dc8d2a216a6e..31073a8304fb 100644 --- a/arch/arm64/lib/memcpy.S +++ b/arch/arm64/lib/memcpy.S @@ -1,66 +1,252 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (C) 2013 ARM Ltd. - * Copyright (C) 2013 Linaro. + * Copyright (c) 2012-2020, Arm Limited. * - * This code is based on glibc cortex strings work originally authored by Linaro - * be found @ - * - * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ - * files/head:/src/aarch64/ + * Adapted from the original at: + * https://github.com/ARM-software/optimized-routines/blob/master/string/aarch64/memcpy.S */ #include #include -#include -/* - * Copy a buffer from src to dest (alignment handled by the hardware) +/* Assumptions: + * + * ARMv8-a, AArch64, unaligned accesses. * - * Parameters: - * x0 - dest - * x1 - src - * x2 - n - * Returns: - * x0 - dest */ - .macro ldrb1 reg, ptr, val - ldrb \reg, [\ptr], \val - .endm - - .macro strb1 reg, ptr, val - strb \reg, [\ptr], \val - .endm - .macro ldrh1 reg, ptr, val - ldrh \reg, [\ptr], \val - .endm +#define L(label) .L ## label - .macro strh1 reg, ptr, val - strh \reg, [\ptr], \val - .endm +#define dstin x0 +#define src x1 +#define count x2 +#define dst x3 +#define srcend x4 +#define dstend x5 +#define A_l x6 +#define A_lw w6 +#define A_h x7 +#define B_l x8 +#define B_lw w8 +#define B_h x9 +#define C_l x10 +#define C_lw w10 +#define C_h x11 +#define D_l x12 +#define D_h x13 +#define E_l x14 +#define E_h x15 +#define F_l x16 +#define F_h x17 +#define G_l count +#define G_h dst +#define H_l src +#define H_h srcend +#define tmp1 x14 - .macro ldr1 reg, ptr, val - ldr \reg, [\ptr], \val - .endm +/* This implementation handles overlaps and supports both memcpy and memmove + from a single entry point. It uses unaligned accesses and branchless + sequences to keep the code small, simple and improve performance. - .macro str1 reg, ptr, val - str \reg, [\ptr], \val - .endm + Copies are split into 3 main cases: small copies of up to 32 bytes, medium + copies of up to 128 bytes, and large copies. The overhead of the overlap + check is negligible since it is only required for large copies. - .macro ldp1 reg1, reg2, ptr, val - ldp \reg1, \reg2, [\ptr], \val - .endm - - .macro stp1 reg1, reg2, ptr, val - stp \reg1, \reg2, [\ptr], \val - .endm + Large copies use a software pipelined loop processing 64 bytes per iteration. + The destination pointer is 16-byte aligned to minimize unaligned accesses. + The loop tail is handled by always copying 64 bytes from the end. 
+*/ +SYM_FUNC_START_ALIAS(__memmove) +SYM_FUNC_START_WEAK_ALIAS_PI(memmove) SYM_FUNC_START_ALIAS(__memcpy) SYM_FUNC_START_WEAK_PI(memcpy) -#include "copy_template.S" + add srcend, src, count + add dstend, dstin, count + cmp count, 128 + b.hi L(copy_long) + cmp count, 32 + b.hi L(copy32_128) + + /* Small copies: 0..32 bytes. */ + cmp count, 16 + b.lo L(copy16) + ldp A_l, A_h, [src] + ldp D_l, D_h, [srcend, -16] + stp A_l, A_h, [dstin] + stp D_l, D_h, [dstend, -16] + ret + + /* Copy 8-15 bytes. */ +L(copy16): + tbz count, 3, L(copy8) + ldr A_l, [src] + ldr A_h, [srcend, -8] + str A_l, [dstin] + str A_h, [dstend, -8] + ret + + .p2align 3 + /* Copy 4-7 bytes. */ +L(copy8): + tbz count, 2, L(copy4) + ldr A_lw, [src] + ldr B_lw, [srcend, -4] + str A_lw, [dstin] + str B_lw, [dstend, -4] + ret + + /* Copy 0..3 bytes using a branchless sequence. */ +L(copy4): + cbz count, L(copy0) + lsr tmp1, count, 1 + ldrb A_lw, [src] + ldrb C_lw, [srcend, -1] + ldrb B_lw, [src, tmp1] + strb A_lw, [dstin] + strb B_lw, [dstin, tmp1] + strb C_lw, [dstend, -1] +L(copy0): + ret + + .p2align 4 + /* Medium copies: 33..128 bytes. */ +L(copy32_128): + ldp A_l, A_h, [src] + ldp B_l, B_h, [src, 16] + ldp C_l, C_h, [srcend, -32] + ldp D_l, D_h, [srcend, -16] + cmp count, 64 + b.hi L(copy128) + stp A_l, A_h, [dstin] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstend, -32] + stp D_l, D_h, [dstend, -16] ret + + .p2align 4 + /* Copy 65..128 bytes. */ +L(copy128): + ldp E_l, E_h, [src, 32] + ldp F_l, F_h, [src, 48] + cmp count, 96 + b.ls L(copy96) + ldp G_l, G_h, [srcend, -64] + ldp H_l, H_h, [srcend, -48] + stp G_l, G_h, [dstend, -64] + stp H_l, H_h, [dstend, -48] +L(copy96): + stp A_l, A_h, [dstin] + stp B_l, B_h, [dstin, 16] + stp E_l, E_h, [dstin, 32] + stp F_l, F_h, [dstin, 48] + stp C_l, C_h, [dstend, -32] + stp D_l, D_h, [dstend, -16] + ret + + .p2align 4 + /* Copy more than 128 bytes. */ +L(copy_long): + /* Use backwards copy if there is an overlap. */ + sub tmp1, dstin, src + cbz tmp1, L(copy0) + cmp tmp1, count + b.lo L(copy_long_backwards) + + /* Copy 16 bytes and then align dst to 16-byte alignment. */ + + ldp D_l, D_h, [src] + and tmp1, dstin, 15 + bic dst, dstin, 15 + sub src, src, tmp1 + add count, count, tmp1 /* Count is now 16 too large. */ + ldp A_l, A_h, [src, 16] + stp D_l, D_h, [dstin] + ldp B_l, B_h, [src, 32] + ldp C_l, C_h, [src, 48] + ldp D_l, D_h, [src, 64]! + subs count, count, 128 + 16 /* Test and readjust count. */ + b.ls L(copy64_from_end) + +L(loop64): + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [src, 16] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [src, 32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [src, 48] + stp D_l, D_h, [dst, 64]! + ldp D_l, D_h, [src, 64]! + subs count, count, 64 + b.hi L(loop64) + + /* Write the last iteration and copy 64 bytes from the end. */ +L(copy64_from_end): + ldp E_l, E_h, [srcend, -64] + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [srcend, -48] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [srcend, -32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [srcend, -16] + stp D_l, D_h, [dst, 64] + stp E_l, E_h, [dstend, -64] + stp A_l, A_h, [dstend, -48] + stp B_l, B_h, [dstend, -32] + stp C_l, C_h, [dstend, -16] + ret + + .p2align 4 + + /* Large backwards copy for overlapping copies. + Copy 16 bytes and then align dst to 16-byte alignment. 
*/ +L(copy_long_backwards): + ldp D_l, D_h, [srcend, -16] + and tmp1, dstend, 15 + sub srcend, srcend, tmp1 + sub count, count, tmp1 + ldp A_l, A_h, [srcend, -16] + stp D_l, D_h, [dstend, -16] + ldp B_l, B_h, [srcend, -32] + ldp C_l, C_h, [srcend, -48] + ldp D_l, D_h, [srcend, -64]! + sub dstend, dstend, tmp1 + subs count, count, 128 + b.ls L(copy64_from_start) + +L(loop64_backwards): + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [srcend, -16] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [srcend, -32] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [srcend, -48] + stp D_l, D_h, [dstend, -64]! + ldp D_l, D_h, [srcend, -64]! + subs count, count, 64 + b.hi L(loop64_backwards) + + /* Write the last iteration and copy 64 bytes from the start. */ +L(copy64_from_start): + ldp G_l, G_h, [src, 48] + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [src, 32] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [src, 16] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [src] + stp D_l, D_h, [dstend, -64] + stp G_l, G_h, [dstin, 48] + stp A_l, A_h, [dstin, 32] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstin] + ret + SYM_FUNC_END_PI(memcpy) EXPORT_SYMBOL(memcpy) SYM_FUNC_END_ALIAS(__memcpy) EXPORT_SYMBOL(__memcpy) +SYM_FUNC_END_ALIAS_PI(memmove) +EXPORT_SYMBOL(memmove) +SYM_FUNC_END_ALIAS(__memmove) +EXPORT_SYMBOL(__memmove) \ No newline at end of file diff --git a/arch/arm64/lib/memmove.S b/arch/arm64/lib/memmove.S deleted file mode 100644 index 1035dce4bdaf..000000000000 --- a/arch/arm64/lib/memmove.S +++ /dev/null @@ -1,189 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2013 ARM Ltd. - * Copyright (C) 2013 Linaro. - * - * This code is based on glibc cortex strings work originally authored by Linaro - * be found @ - * - * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ - * files/head:/src/aarch64/ - */ - -#include -#include -#include - -/* - * Move a buffer from src to test (alignment handled by the hardware). - * If dest <= src, call memcpy, otherwise copy in reverse order. - * - * Parameters: - * x0 - dest - * x1 - src - * x2 - n - * Returns: - * x0 - dest - */ -dstin .req x0 -src .req x1 -count .req x2 -tmp1 .req x3 -tmp1w .req w3 -tmp2 .req x4 -tmp2w .req w4 -tmp3 .req x5 -tmp3w .req w5 -dst .req x6 - -A_l .req x7 -A_h .req x8 -B_l .req x9 -B_h .req x10 -C_l .req x11 -C_h .req x12 -D_l .req x13 -D_h .req x14 - -SYM_FUNC_START_ALIAS(__memmove) -SYM_FUNC_START_WEAK_PI(memmove) - cmp dstin, src - b.lo __memcpy - add tmp1, src, count - cmp dstin, tmp1 - b.hs __memcpy /* No overlap. */ - - add dst, dstin, count - add src, src, count - cmp count, #16 - b.lo .Ltail15 /*probably non-alignment accesses.*/ - - ands tmp2, src, #15 /* Bytes to reach alignment. */ - b.eq .LSrcAligned - sub count, count, tmp2 - /* - * process the aligned offset length to make the src aligned firstly. - * those extra instructions' cost is acceptable. It also make the - * coming accesses are based on aligned address. - */ - tbz tmp2, #0, 1f - ldrb tmp1w, [src, #-1]! - strb tmp1w, [dst, #-1]! -1: - tbz tmp2, #1, 2f - ldrh tmp1w, [src, #-2]! - strh tmp1w, [dst, #-2]! -2: - tbz tmp2, #2, 3f - ldr tmp1w, [src, #-4]! - str tmp1w, [dst, #-4]! -3: - tbz tmp2, #3, .LSrcAligned - ldr tmp1, [src, #-8]! - str tmp1, [dst, #-8]! - -.LSrcAligned: - cmp count, #64 - b.ge .Lcpy_over64 - - /* - * Deal with small copies quickly by dropping straight into the - * exit block. - */ -.Ltail63: - /* - * Copy up to 48 bytes of data. At this point we only need the - * bottom 6 bits of count to be accurate. 
- */ - ands tmp1, count, #0x30 - b.eq .Ltail15 - cmp tmp1w, #0x20 - b.eq 1f - b.lt 2f - ldp A_l, A_h, [src, #-16]! - stp A_l, A_h, [dst, #-16]! -1: - ldp A_l, A_h, [src, #-16]! - stp A_l, A_h, [dst, #-16]! -2: - ldp A_l, A_h, [src, #-16]! - stp A_l, A_h, [dst, #-16]! - -.Ltail15: - tbz count, #3, 1f - ldr tmp1, [src, #-8]! - str tmp1, [dst, #-8]! -1: - tbz count, #2, 2f - ldr tmp1w, [src, #-4]! - str tmp1w, [dst, #-4]! -2: - tbz count, #1, 3f - ldrh tmp1w, [src, #-2]! - strh tmp1w, [dst, #-2]! -3: - tbz count, #0, .Lexitfunc - ldrb tmp1w, [src, #-1] - strb tmp1w, [dst, #-1] - -.Lexitfunc: - ret - -.Lcpy_over64: - subs count, count, #128 - b.ge .Lcpy_body_large - /* - * Less than 128 bytes to copy, so handle 64 bytes here and then jump - * to the tail. - */ - ldp A_l, A_h, [src, #-16] - stp A_l, A_h, [dst, #-16] - ldp B_l, B_h, [src, #-32] - ldp C_l, C_h, [src, #-48] - stp B_l, B_h, [dst, #-32] - stp C_l, C_h, [dst, #-48] - ldp D_l, D_h, [src, #-64]! - stp D_l, D_h, [dst, #-64]! - - tst count, #0x3f - b.ne .Ltail63 - ret - - /* - * Critical loop. Start at a new cache line boundary. Assuming - * 64 bytes per line this ensures the entire loop is in one line. - */ - .p2align L1_CACHE_SHIFT -.Lcpy_body_large: - /* pre-load 64 bytes data. */ - ldp A_l, A_h, [src, #-16] - ldp B_l, B_h, [src, #-32] - ldp C_l, C_h, [src, #-48] - ldp D_l, D_h, [src, #-64]! -1: - /* - * interlace the load of next 64 bytes data block with store of the last - * loaded 64 bytes data. - */ - stp A_l, A_h, [dst, #-16] - ldp A_l, A_h, [src, #-16] - stp B_l, B_h, [dst, #-32] - ldp B_l, B_h, [src, #-32] - stp C_l, C_h, [dst, #-48] - ldp C_l, C_h, [src, #-48] - stp D_l, D_h, [dst, #-64]! - ldp D_l, D_h, [src, #-64]! - subs count, count, #64 - b.ge 1b - stp A_l, A_h, [dst, #-16] - stp B_l, B_h, [dst, #-32] - stp C_l, C_h, [dst, #-48] - stp D_l, D_h, [dst, #-64]! - - tst count, #0x3f - b.ne .Ltail63 - ret -SYM_FUNC_END_PI(memmove) -EXPORT_SYMBOL(memmove) -SYM_FUNC_END_ALIAS(__memmove) -EXPORT_SYMBOL(__memmove) -- cgit v1.2.3 From 9e51cafd783b22018fb15bfb06d65f69349223a9 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 27 May 2021 16:34:47 +0100 Subject: arm64: Better optimised memchr() Although we implement our own assembly version of memchr(), it turns out to be barely any better than what GCC can generate for the generic C version (and would go wrong if the size_t argument were ever large enough to be interpreted as negative). Unfortunately we can't import the tuned implementation from the Arm optimized-routines library, since that has some Advanced SIMD parts which are not really viable for general kernel library code. What we can do, however, is pep things up with some relatively straightforward word-at-a-time logic for larger calls. Adding some timing to optimized-routines' memchr() test for a simple benchmark, overall this version comes in around half as fast as the SIMD code, but still nearly 4x faster than our existing implementation. 
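To make the word-at-a-time idea concrete: the core trick is to XOR each 8-byte word with the search character replicated into every byte lane, after which the usual zero-byte test flags any matching byte. The sketch below is ours (the helper name and the plain-C formulation are illustrative assumptions, not the kernel routine, which additionally handles the residual count and recovers the exact match position):

  #include <stddef.h>
  #include <stdint.h>
  #include <string.h>

  #define REP8_01 0x0101010101010101ULL
  #define REP8_80 0x8080808080808080ULL

  static const void *memchr_sketch(const void *s, int c, size_t n)
  {
          const unsigned char *p = s;
          uint64_t repchr = (unsigned char)c * REP8_01;   /* c in every byte */

          while (n >= 8) {
                  uint64_t word;

                  memcpy(&word, p, sizeof(word));         /* unaligned-safe load */
                  word ^= repchr;                         /* matching bytes become 0 */
                  if ((word - REP8_01) & ~word & REP8_80)
                          break;                          /* some byte is 0: match in this word */
                  p += 8;
                  n -= 8;
          }
          while (n--) {                                   /* finish byte by byte */
                  if (*p == (unsigned char)c)
                          return p;
                  p++;
          }
          return NULL;
  }

The assembly version expresses the same detection as a BICS of (word - REP8_01) against (word | REP8_7f), and instead of falling back to a byte loop it uses REV and CLZ on the syndrome to compute the matching byte's offset directly.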
Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/58471b42f9287e039dafa9e5e7035077152438fd.1622128527.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- arch/arm64/lib/memchr.S | 65 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 53 insertions(+), 12 deletions(-) diff --git a/arch/arm64/lib/memchr.S b/arch/arm64/lib/memchr.S index edf6b970a277..7c2276fdab54 100644 --- a/arch/arm64/lib/memchr.S +++ b/arch/arm64/lib/memchr.S @@ -1,9 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Based on arch/arm/lib/memchr.S - * - * Copyright (C) 1995-2000 Russell King - * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2021 Arm Ltd. */ #include @@ -19,16 +16,60 @@ * Returns: * x0 - address of first occurrence of 'c' or 0 */ + +#define L(label) .L ## label + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f + +#define srcin x0 +#define chrin w1 +#define cntin x2 + +#define result x0 + +#define wordcnt x3 +#define rep01 x4 +#define repchr x5 +#define cur_word x6 +#define cur_byte w6 +#define tmp x7 +#define tmp2 x8 + + .p2align 4 + nop SYM_FUNC_START_WEAK_PI(memchr) - and w1, w1, #0xff -1: subs x2, x2, #1 - b.mi 2f - ldrb w3, [x0], #1 - cmp w3, w1 - b.ne 1b - sub x0, x0, #1 + and chrin, chrin, #0xff + lsr wordcnt, cntin, #3 + cbz wordcnt, L(byte_loop) + mov rep01, #REP8_01 + mul repchr, x1, rep01 + and cntin, cntin, #7 +L(word_loop): + ldr cur_word, [srcin], #8 + sub wordcnt, wordcnt, #1 + eor cur_word, cur_word, repchr + sub tmp, cur_word, rep01 + orr tmp2, cur_word, #REP8_7f + bics tmp, tmp, tmp2 + b.ne L(found_word) + cbnz wordcnt, L(word_loop) +L(byte_loop): + cbz cntin, L(not_found) + ldrb cur_byte, [srcin], #1 + sub cntin, cntin, #1 + cmp cur_byte, chrin + b.ne L(byte_loop) + sub srcin, srcin, #1 + ret +L(found_word): +CPU_LE( rev tmp, tmp) + clz tmp, tmp + sub tmp, tmp, #64 + add result, srcin, tmp, asr #3 ret -2: mov x0, #0 +L(not_found): + mov result, #0 ret SYM_FUNC_END_PI(memchr) EXPORT_SYMBOL_NOKASAN(memchr) -- cgit v1.2.3 From 344323e0428b9911406bede6cff23d1482c19eae Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 27 May 2021 16:34:48 +0100 Subject: arm64: Rewrite __arch_clear_user() Now that we're always using STTR variants rather than abstracting two different addressing modes, the user_ldst macro here is frankly more obfuscating than helpful. Rewrite __arch_clear_user() with regular USER() annotations so that it's clearer what's going on, and take the opportunity to minimise the branchiness in the most common paths, while also allowing the exception fixup to return an accurate result. Apparently some folks examine large reads from /dev/zero closely enough to notice the loop being hot, so align it per the other critical loops (presumably around a typical instruction fetch granularity). Reviewed-by: Mark Rutland Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/1cbd78b12c076a8ad4656a345811cfb9425df0b3.1622128527.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- arch/arm64/lib/clear_user.S | 47 ++++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S index af9afcbec92c..a7efb2ad2a1c 100644 --- a/arch/arm64/lib/clear_user.S +++ b/arch/arm64/lib/clear_user.S @@ -1,12 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Based on arch/arm/lib/clear_user.S - * - * Copyright (C) 2012 ARM Ltd. + * Copyright (C) 2021 Arm Ltd. 
*/ -#include -#include +#include #include .text @@ -19,25 +16,33 @@ * * Alignment fixed up by hardware. */ + + .p2align 4 + // Alignment is for the loop, but since the prologue (including BTI) + // is also 16 bytes we can keep any padding outside the function SYM_FUNC_START(__arch_clear_user) - mov x2, x1 // save the size for fixup return + add x2, x0, x1 subs x1, x1, #8 b.mi 2f 1: -user_ldst 9f, sttr, xzr, x0, 8 +USER(9f, sttr xzr, [x0]) + add x0, x0, #8 subs x1, x1, #8 - b.pl 1b -2: adds x1, x1, #4 - b.mi 3f -user_ldst 9f, sttr, wzr, x0, 4 - sub x1, x1, #4 -3: adds x1, x1, #2 - b.mi 4f -user_ldst 9f, sttrh, wzr, x0, 2 - sub x1, x1, #2 -4: adds x1, x1, #1 - b.mi 5f -user_ldst 9f, sttrb, wzr, x0, 0 + b.hi 1b +USER(9f, sttr xzr, [x2, #-8]) + mov x0, #0 + ret + +2: tbz x1, #2, 3f +USER(9f, sttr wzr, [x0]) +USER(8f, sttr wzr, [x2, #-4]) + mov x0, #0 + ret + +3: tbz x1, #1, 4f +USER(9f, sttrh wzr, [x0]) +4: tbz x1, #0, 5f +USER(7f, sttrb wzr, [x2, #-1]) 5: mov x0, #0 ret SYM_FUNC_END(__arch_clear_user) @@ -45,6 +50,8 @@ EXPORT_SYMBOL(__arch_clear_user) .section .fixup,"ax" .align 2 -9: mov x0, x2 // return the original size +7: sub x0, x2, #5 // Adjust for faulting on the final byte... +8: add x0, x0, #4 // ...or the second word of the 4-7 byte case +9: sub x0, x2, x0 ret .previous -- cgit v1.2.3 From 5ae632ed356c2f2e42a3e7ea447e98a9e684539c Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Sat, 29 May 2021 19:15:10 +0800 Subject: arm64: mm: Use better bitmap_zalloc() Use better bitmap_zalloc() to allocate bitmap. Cc: Catalin Marinas Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Signed-off-by: Kefeng Wang Link: https://lore.kernel.org/r/20210529111510.186355-1-wangkefeng.wang@huawei.com Signed-off-by: Will Deacon --- arch/arm64/mm/context.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index 001737a8f309..cd72576ae2b7 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -402,14 +402,12 @@ static int asids_init(void) { asid_bits = get_cpu_asid_bits(); atomic64_set(&asid_generation, ASID_FIRST_VERSION); - asid_map = kcalloc(BITS_TO_LONGS(NUM_USER_ASIDS), sizeof(*asid_map), - GFP_KERNEL); + asid_map = bitmap_zalloc(NUM_USER_ASIDS, GFP_KERNEL); if (!asid_map) panic("Failed to allocate bitmap for %lu ASIDs\n", NUM_USER_ASIDS); - pinned_asid_map = kcalloc(BITS_TO_LONGS(NUM_USER_ASIDS), - sizeof(*pinned_asid_map), GFP_KERNEL); + pinned_asid_map = bitmap_zalloc(NUM_USER_ASIDS, GFP_KERNEL); nr_pinned_asids = 0; /* -- cgit v1.2.3 From 58cc6b72a2127475296502fcb4d2b5006b7f4742 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 27 May 2021 12:03:17 +0100 Subject: arm64: mm: Remove unused support for Device-GRE memory type The Device-GRE memory type is unused, so remove it and reclaim a MAIR. 
Cc: Christoph Hellwig Acked-by: Catalin Marinas Suggested-by: Catalin Marinas Link: https://lore.kernel.org/r/20210505180228.GA3874@arm.com Link: https://lore.kernel.org/r/20210527110319.22157-2-will@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/memory.h | 1 - arch/arm64/include/asm/sysreg.h | 1 - arch/arm64/mm/proc.S | 1 - arch/arm64/mm/ptdump.c | 4 ---- 4 files changed, 7 deletions(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 87b90dc27a43..1e025e3b655e 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -138,7 +138,6 @@ #define MT_NORMAL_WT 3 #define MT_DEVICE_nGnRnE 4 #define MT_DEVICE_nGnRE 5 -#define MT_DEVICE_GRE 6 /* * Memory types for Stage-2 translation diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 65d15700a168..baeb33cd7685 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -703,7 +703,6 @@ /* MAIR_ELx memory attributes (used by Linux) */ #define MAIR_ATTR_DEVICE_nGnRnE UL(0x00) #define MAIR_ATTR_DEVICE_nGnRE UL(0x04) -#define MAIR_ATTR_DEVICE_GRE UL(0x0c) #define MAIR_ATTR_NORMAL_NC UL(0x44) #define MAIR_ATTR_NORMAL_WT UL(0xbb) #define MAIR_ATTR_NORMAL_TAGGED UL(0xf0) diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 97d7bcd8d4f2..add026fcc88c 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -58,7 +58,6 @@ #define MAIR_EL1_SET \ (MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRnE, MT_DEVICE_nGnRnE) | \ MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRE, MT_DEVICE_nGnRE) | \ - MAIR_ATTRIDX(MAIR_ATTR_DEVICE_GRE, MT_DEVICE_GRE) | \ MAIR_ATTRIDX(MAIR_ATTR_NORMAL_NC, MT_NORMAL_NC) | \ MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL) | \ MAIR_ATTRIDX(MAIR_ATTR_NORMAL_WT, MT_NORMAL_WT) | \ diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c index a1937dfff31c..1c403536c9bb 100644 --- a/arch/arm64/mm/ptdump.c +++ b/arch/arm64/mm/ptdump.c @@ -157,10 +157,6 @@ static const struct prot_bits pte_bits[] = { .mask = PTE_ATTRINDX_MASK, .val = PTE_ATTRINDX(MT_DEVICE_nGnRE), .set = "DEVICE/nGnRE", - }, { - .mask = PTE_ATTRINDX_MASK, - .val = PTE_ATTRINDX(MT_DEVICE_GRE), - .set = "DEVICE/GRE", }, { .mask = PTE_ATTRINDX_MASK, .val = PTE_ATTRINDX(MT_NORMAL_NC), -- cgit v1.2.3 From ee67c1103a1b50467969cf2cdb182c096c144459 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 27 May 2021 12:03:18 +0100 Subject: arm64: acpi: Map EFI_MEMORY_WT memory as Normal-NC The only user we have of Normal Write-Through memory is in the ACPI code when mapping memory regions advertised as EFI_MEMORY_WT. Since most (all?) CPUs treat write-through as non-cacheable under the hood, don't bother with the extra memory type here and just treat EFI_MEMORY_WT the same way as EFI_MEMORY_WC by mapping it to the Normal-NC memory type instead and emitting a warning if we have failed to find an alternative EFI memory type. 
Cc: Ard Biesheuvel Cc: Lorenzo Pieralisi Cc: Christoph Hellwig Acked-by: Catalin Marinas Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20210527110319.22157-3-will@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/acpi.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index cada0b816c8a..f3851724fe35 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -239,6 +239,18 @@ done: } } +static pgprot_t __acpi_get_writethrough_mem_attribute(void) +{ + /* + * Although UEFI specifies the use of Normal Write-through for + * EFI_MEMORY_WT, it is seldom used in practice and not implemented + * by most (all?) CPUs. Rather than allocate a MAIR just for this + * purpose, emit a warning and use Normal Non-cacheable instead. + */ + pr_warn_once("No MAIR allocation for EFI_MEMORY_WT; treating as Normal Non-cacheable\n"); + return __pgprot(PROT_NORMAL_NC); +} + pgprot_t __acpi_get_mem_attribute(phys_addr_t addr) { /* @@ -246,7 +258,7 @@ pgprot_t __acpi_get_mem_attribute(phys_addr_t addr) * types" of UEFI 2.5 section 2.3.6.1, each EFI memory type is * mapped to a corresponding MAIR attribute encoding. * The EFI memory attribute advises all possible capabilities - * of a memory region. We use the most efficient capability. + * of a memory region. */ u64 attr; @@ -254,10 +266,10 @@ pgprot_t __acpi_get_mem_attribute(phys_addr_t addr) attr = efi_mem_attributes(addr); if (attr & EFI_MEMORY_WB) return PAGE_KERNEL; - if (attr & EFI_MEMORY_WT) - return __pgprot(PROT_NORMAL_WT); if (attr & EFI_MEMORY_WC) return __pgprot(PROT_NORMAL_NC); + if (attr & EFI_MEMORY_WT) + return __acpi_get_writethrough_mem_attribute(); return __pgprot(PROT_DEVICE_nGnRnE); } @@ -340,10 +352,10 @@ void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size) default: if (region->attribute & EFI_MEMORY_WB) prot = PAGE_KERNEL; - else if (region->attribute & EFI_MEMORY_WT) - prot = __pgprot(PROT_NORMAL_WT); else if (region->attribute & EFI_MEMORY_WC) prot = __pgprot(PROT_NORMAL_NC); + else if (region->attribute & EFI_MEMORY_WT) + prot = __acpi_get_writethrough_mem_attribute(); } } return __ioremap(phys, size, prot); -- cgit v1.2.3 From 21cfe6edbadb703b674ae2ddf78862d00d24bfc5 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 27 May 2021 12:03:19 +0100 Subject: arm64: mm: Remove unused support for Normal-WT memory type The Normal-WT memory type is unused, so remove it and reclaim a MAIR. 
Cc: Christoph Hellwig Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20210527110319.22157-4-will@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/memory.h | 5 ++--- arch/arm64/include/asm/pgtable-prot.h | 1 - arch/arm64/include/asm/sysreg.h | 1 - arch/arm64/mm/proc.S | 1 - 4 files changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 1e025e3b655e..7b360960cc35 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -135,9 +135,8 @@ #define MT_NORMAL 0 #define MT_NORMAL_TAGGED 1 #define MT_NORMAL_NC 2 -#define MT_NORMAL_WT 3 -#define MT_DEVICE_nGnRnE 4 -#define MT_DEVICE_nGnRE 5 +#define MT_DEVICE_nGnRnE 3 +#define MT_DEVICE_nGnRE 4 /* * Memory types for Stage-2 translation diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 938092df76cf..7032f04c8ac6 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -55,7 +55,6 @@ extern bool arm64_use_ng_mappings; #define PROT_DEVICE_nGnRnE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRnE)) #define PROT_DEVICE_nGnRE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRE)) #define PROT_NORMAL_NC (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL_NC)) -#define PROT_NORMAL_WT (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL_WT)) #define PROT_NORMAL (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL)) #define PROT_NORMAL_TAGGED (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL_TAGGED)) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index baeb33cd7685..9ea84bcddf85 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -704,7 +704,6 @@ #define MAIR_ATTR_DEVICE_nGnRnE UL(0x00) #define MAIR_ATTR_DEVICE_nGnRE UL(0x04) #define MAIR_ATTR_NORMAL_NC UL(0x44) -#define MAIR_ATTR_NORMAL_WT UL(0xbb) #define MAIR_ATTR_NORMAL_TAGGED UL(0xf0) #define MAIR_ATTR_NORMAL UL(0xff) #define MAIR_ATTR_MASK UL(0xff) diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index add026fcc88c..6e640fa9788e 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -60,7 +60,6 @@ MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRE, MT_DEVICE_nGnRE) | \ MAIR_ATTRIDX(MAIR_ATTR_NORMAL_NC, MT_NORMAL_NC) | \ MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL) | \ - MAIR_ATTRIDX(MAIR_ATTR_NORMAL_WT, MT_NORMAL_WT) | \ MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL_TAGGED)) #ifdef CONFIG_CPU_PM -- cgit v1.2.3 From 65688d2a05deb9f0671a7e2301eadbfe7e27c9e9 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 27 May 2021 13:43:56 +0100 Subject: arm64: cache: Lower ARCH_DMA_MINALIGN to 64 (L1_CACHE_BYTES) Back in 97303480753e ("arm64: Increase the max granular size"), ARCH_DMA_MINALIGN was effectively increased to 128 bytes thanks to an increase in L1_CACHE_BYTES due to an unsubstantiated performance claim on the now obsolete ThunderX-1. Although this was reverted in d93277b9839b, ARCH_DMA_MINALIGN was kept at 128 bytes by ebc7e21e0fa2 ("arm64: Increase ARCH_DMA_MINALIGN to 128"). During discussion of the original patch, it was reported that the change also prevented a warning during boot on (again, now obsolete) Qualcomm server hardware where the cache writeback granule was larger than 64 bytes. 
The reason for this warning was because non-coherent DMA could lead to data corruption due to unexpected writeback from the CPU where a cacheline is shared with other allocations. Since then, systems have appeared with larger cachelines still, and so commit 8f5c9037a55b ("arm64/mm: Correct the cache line size warning with non coherent device") reworked the warning so that it only appears on systems where non-coherent DMA is actually required and taints the kernel with TAINT_CPU_OUT_OF_SPEC. We are not aware of any systems, even including the aforementioned obsolete machines, which have a CWG larger than 64 bytes and require non-coherent DMA. More recently, it has been reported that a ARCH_DMA_MINALIGN of 128 bytes wastes considerable memory (~6% immediately after boot on one system). Reduce ARCH_DMA_MINALIGN to 64 bytes and allow the warning/taint to indicate if there are machines that unknowingly rely on this. Cc: Catalin Marinas Cc: Mark Rutland Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Vincent Whitchurch Link: https://lore.kernel.org/linux-arm-kernel/1442944788-17254-1-git-send-email-rric@kernel.org/ Link: https://lore.kernel.org/linux-arm-kernel/CAOZdJXUiRMAguDV+HEJqPg57MyBNqEcTyaH+ya=U93NHb-pdJA@mail.gmail.com/ Link: https://lore.kernel.org/linux-arm-kernel/20190614131141.4428-1-msys.mizuma@gmail.com/ Link: https://lore.kernel.org/r/20210517074332.28280-1-vincent.whitchurch@axis.com Acked-by: Catalin Marinas Acked-by: Mark Rutland Acked-by: Arnd Bergmann Acked-by: Mark Rutland Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20210527124356.22367-1-will@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/cache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h index a074459f8f2f..a9c0716e7440 100644 --- a/arch/arm64/include/asm/cache.h +++ b/arch/arm64/include/asm/cache.h @@ -47,7 +47,7 @@ * cache before the transfer is done, causing old data to be seen by * the CPU. */ -#define ARCH_DMA_MINALIGN (128) +#define ARCH_DMA_MINALIGN L1_CACHE_BYTES #ifdef CONFIG_KASAN_SW_TAGS #define ARCH_SLAB_MINALIGN (1ULL << KASAN_SHADOW_SCALE_SHIFT) -- cgit v1.2.3 From 3c1f2eb5475a4031d9555a38de2467d80019c66a Mon Sep 17 00:00:00 2001 From: Tian Tao Date: Wed, 2 Jun 2021 09:00:41 +0800 Subject: arm_pmu: move to use request_irq by IRQF_NO_AUTOEN flag request_irq() after setting IRQ_NOAUTOEN as below irq_set_status_flags(irq, IRQ_NOAUTOEN); request_irq(dev, irq...); can be replaced by request_irq() with IRQF_NO_AUTOEN flag. 
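Schematically, with a hypothetical handler and device cookie (my_handler, "my-dev" and dev are placeholders, and the extra IRQF_NO_THREAD flag is only there to show the flag being OR-ed in), the conversion looks like this:

  /* before: the automatic enable had to be suppressed in a separate step */
  irq_set_status_flags(irq, IRQ_NOAUTOEN);
  err = request_irq(irq, my_handler, IRQF_NO_THREAD, "my-dev", dev);

  /* after: a single call; the IRQ stays disabled until an explicit enable_irq() */
  err = request_irq(irq, my_handler, IRQF_NO_THREAD | IRQF_NO_AUTOEN,
                    "my-dev", dev);

The flag simply tells the core not to enable the interrupt as part of request_irq(); the driver enables it later with enable_irq() once it is ready to take interrupts.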
this patch is made base on "add IRQF_NO_AUTOEN for request_irq" which is being merged: https://lore.kernel.org/patchwork/patch/1388765/ Signed-off-by: Tian Tao Acked-by: Mark Rutland Link: https://lore.kernel.org/r/1622595642-61678-2-git-send-email-tiantao6@hisilicon.com Signed-off-by: Will Deacon --- drivers/perf/arm_pmu.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index a64e254a731b..3cbc3baf087f 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -644,11 +644,9 @@ int armpmu_request_irq(int irq, int cpu) } irq_flags = IRQF_PERCPU | - IRQF_NOBALANCING | + IRQF_NOBALANCING | IRQF_NO_AUTOEN | IRQF_NO_THREAD; - irq_set_status_flags(irq, IRQ_NOAUTOEN); - err = request_nmi(irq, handler, irq_flags, "arm-pmu", per_cpu_ptr(&cpu_armpmu, cpu)); -- cgit v1.2.3 From 0d0f144a8f5f9815a180d16ef7d08b6269016897 Mon Sep 17 00:00:00 2001 From: Tian Tao Date: Wed, 2 Jun 2021 09:00:42 +0800 Subject: perf: qcom_l2_pmu: move to use request_irq by IRQF_NO_AUTOEN flag request_irq() after setting IRQ_NOAUTOEN as below irq_set_status_flags(irq, IRQ_NOAUTOEN); request_irq(dev, irq...); can be replaced by request_irq() with IRQF_NO_AUTOEN flag. this patch is made base on "add IRQF_NO_AUTOEN for request_irq" which is being merged: https://lore.kernel.org/patchwork/patch/1388765/ Signed-off-by: Tian Tao Acked-by: Mark Rutland Link: https://lore.kernel.org/r/1622595642-61678-3-git-send-email-tiantao6@hisilicon.com Signed-off-by: Will Deacon --- drivers/perf/qcom_l2_pmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/perf/qcom_l2_pmu.c b/drivers/perf/qcom_l2_pmu.c index fc54a80f9c5c..b60e30141583 100644 --- a/drivers/perf/qcom_l2_pmu.c +++ b/drivers/perf/qcom_l2_pmu.c @@ -869,14 +869,14 @@ static int l2_cache_pmu_probe_cluster(struct device *dev, void *data) irq = platform_get_irq(sdev, 0); if (irq < 0) return irq; - irq_set_status_flags(irq, IRQ_NOAUTOEN); cluster->irq = irq; cluster->l2cache_pmu = l2cache_pmu; cluster->on_cpu = -1; err = devm_request_irq(&pdev->dev, irq, l2_cache_handle_irq, - IRQF_NOBALANCING | IRQF_NO_THREAD, + IRQF_NOBALANCING | IRQF_NO_THREAD | + IRQF_NO_AUTOEN, "l2-cache-pmu", cluster); if (err) { dev_err(&pdev->dev, -- cgit v1.2.3 From 6b8f648959e5036695f056a60e3444f4753f643e Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 2 Jun 2021 16:13:58 +0100 Subject: arm64: update string routine copyrights and URLs To make future archaeology easier, let's have the string routine comment blocks encode the specific upstream commit ID they were imported from. These are the same commit IDs as listed in the commits importing the code, expanded to 16 characters. Note that the routines have different commit IDs, each reprsenting the latest upstream commit which changed the particular routine. At the same time, let's consistently include 2021 in the copyright dates. 
Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Robin Murphy Cc: Will Deacon Link: https://lore.kernel.org/r/20210602151358.35571-1-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/lib/memcmp.S | 4 ++-- arch/arm64/lib/memcpy.S | 6 +++--- arch/arm64/lib/strcmp.S | 4 ++-- arch/arm64/lib/strlen.S | 4 ++-- arch/arm64/lib/strncmp.S | 4 ++-- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/arm64/lib/memcmp.S b/arch/arm64/lib/memcmp.S index 498f0d9941d9..7d956384222f 100644 --- a/arch/arm64/lib/memcmp.S +++ b/arch/arm64/lib/memcmp.S @@ -1,9 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (c) 2013-2020, Arm Limited. + * Copyright (c) 2013-2021, Arm Limited. * * Adapted from the original at: - * https://github.com/ARM-software/optimized-routines/blob/master/string/aarch64/memcmp.S + * https://github.com/ARM-software/optimized-routines/blob/e823e3abf5f89ecb/string/aarch64/memcmp.S */ #include diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S index 31073a8304fb..b82fd64ee1e1 100644 --- a/arch/arm64/lib/memcpy.S +++ b/arch/arm64/lib/memcpy.S @@ -1,9 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (c) 2012-2020, Arm Limited. + * Copyright (c) 2012-2021, Arm Limited. * * Adapted from the original at: - * https://github.com/ARM-software/optimized-routines/blob/master/string/aarch64/memcpy.S + * https://github.com/ARM-software/optimized-routines/blob/afd6244a1f8d9229/string/aarch64/memcpy.S */ #include @@ -249,4 +249,4 @@ EXPORT_SYMBOL(__memcpy) SYM_FUNC_END_ALIAS_PI(memmove) EXPORT_SYMBOL(memmove) SYM_FUNC_END_ALIAS(__memmove) -EXPORT_SYMBOL(__memmove) \ No newline at end of file +EXPORT_SYMBOL(__memmove) diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S index e82ccb6c2f93..d7bee210a798 100644 --- a/arch/arm64/lib/strcmp.S +++ b/arch/arm64/lib/strcmp.S @@ -1,9 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (c) 2012-2020, Arm Limited. + * Copyright (c) 2012-2021, Arm Limited. * * Adapted from the original at: - * https://github.com/ARM-software/optimized-routines/blob/master/string/aarch64/strcmp.S + * https://github.com/ARM-software/optimized-routines/blob/afd6244a1f8d9229/string/aarch64/strcmp.S */ #include diff --git a/arch/arm64/lib/strlen.S b/arch/arm64/lib/strlen.S index b557185b54a5..35fbdb7d6e1a 100644 --- a/arch/arm64/lib/strlen.S +++ b/arch/arm64/lib/strlen.S @@ -1,9 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (c) 2013, Arm Limited. + * Copyright (c) 2013-2021, Arm Limited. * * Adapted from the original at: - * https://github.com/ARM-software/optimized-routines/blob/master/string/aarch64/strlen.S + * https://github.com/ARM-software/optimized-routines/blob/98e4d6a5c13c8e54/string/aarch64/strlen.S */ #include diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S index 0c0bf5462de0..48d44f7fddb1 100644 --- a/arch/arm64/lib/strncmp.S +++ b/arch/arm64/lib/strncmp.S @@ -1,9 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (c) 2013, Arm Limited. + * Copyright (c) 2013-2021, Arm Limited. 
* * Adapted from the original at: - * https://github.com/ARM-software/optimized-routines/blob/master/string/aarch64/strncmp.S + * https://github.com/ARM-software/optimized-routines/blob/e823e3abf5f89ecb/string/aarch64/strncmp.S */ #include -- cgit v1.2.3 From 281e44f5fd4f82d86a2b86f0592c698f7311a674 Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Thu, 3 Jun 2021 15:15:02 +0800 Subject: arm64: perf: Add more support on caps under sysfs Armv8.7 has introduced BUS_SLOTS and BUS_WIDTH in PMMIR_EL1 register, add two entries in caps for bus_slots and bus_width under sysfs. It will return the true slots and width if the information is available, otherwise it will return 0. Cc: Will Deacon Cc: Mark Rutland Cc: Robin Murphy Signed-off-by: Shaokun Zhang Link: https://lore.kernel.org/r/1622704502-63951-1-git-send-email-zhangshaokun@hisilicon.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/perf_event.h | 5 +++++ arch/arm64/kernel/perf_event.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/arch/arm64/include/asm/perf_event.h b/arch/arm64/include/asm/perf_event.h index 60731f602d3e..4ef6f19331f9 100644 --- a/arch/arm64/include/asm/perf_event.h +++ b/arch/arm64/include/asm/perf_event.h @@ -239,6 +239,11 @@ /* PMMIR_EL1.SLOTS mask */ #define ARMV8_PMU_SLOTS_MASK 0xff +#define ARMV8_PMU_BUS_SLOTS_SHIFT 8 +#define ARMV8_PMU_BUS_SLOTS_MASK 0xff +#define ARMV8_PMU_BUS_WIDTH_SHIFT 16 +#define ARMV8_PMU_BUS_WIDTH_MASK 0xf + #ifdef CONFIG_PERF_EVENTS struct pt_regs; extern unsigned long perf_instruction_pointer(struct pt_regs *regs); diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 44b6eda69a81..a661010308c0 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -317,8 +317,41 @@ static ssize_t slots_show(struct device *dev, struct device_attribute *attr, static DEVICE_ATTR_RO(slots); +static ssize_t bus_slots_show(struct device *dev, struct device_attribute *attr, + char *page) +{ + struct pmu *pmu = dev_get_drvdata(dev); + struct arm_pmu *cpu_pmu = container_of(pmu, struct arm_pmu, pmu); + u32 bus_slots = (cpu_pmu->reg_pmmir >> ARMV8_PMU_BUS_SLOTS_SHIFT) + & ARMV8_PMU_BUS_SLOTS_MASK; + + return sysfs_emit(page, "0x%08x\n", bus_slots); +} + +static DEVICE_ATTR_RO(bus_slots); + +static ssize_t bus_width_show(struct device *dev, struct device_attribute *attr, + char *page) +{ + struct pmu *pmu = dev_get_drvdata(dev); + struct arm_pmu *cpu_pmu = container_of(pmu, struct arm_pmu, pmu); + u32 bus_width = (cpu_pmu->reg_pmmir >> ARMV8_PMU_BUS_WIDTH_SHIFT) + & ARMV8_PMU_BUS_WIDTH_MASK; + u32 val = 0; + + /* Encoded as Log2(number of bytes), plus one */ + if (bus_width > 2 && bus_width < 13) + val = 1 << (bus_width - 1); + + return sysfs_emit(page, "0x%08x\n", val); +} + +static DEVICE_ATTR_RO(bus_width); + static struct attribute *armv8_pmuv3_caps_attrs[] = { &dev_attr_slots.attr, + &dev_attr_bus_slots.attr, + &dev_attr_bus_width.attr, NULL, }; -- cgit v1.2.3 From fcf9dc02f83949b3261eefe03e7bb81c59bfaa9c Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 3 Jun 2021 20:02:39 +0800 Subject: arm64: mm: Add is_el1_data_abort() helper We alread have is_el1_instruction_abort(), add is_el1_data_abort() helper and use it. 
Signed-off-by: Kefeng Wang Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20210603120239.169018-1-wangkefeng.wang@huawei.com Signed-off-by: Will Deacon --- arch/arm64/mm/fault.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 871c82ab0a30..5c855b2ab93b 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -232,13 +232,17 @@ static bool is_el1_instruction_abort(unsigned int esr) return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_CUR; } +static bool is_el1_data_abort(unsigned int esr) +{ + return ESR_ELx_EC(esr) == ESR_ELx_EC_DABT_CUR; +} + static inline bool is_el1_permission_fault(unsigned long addr, unsigned int esr, struct pt_regs *regs) { - unsigned int ec = ESR_ELx_EC(esr); unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE; - if (ec != ESR_ELx_EC_DABT_CUR && ec != ESR_ELx_EC_IABT_CUR) + if (!is_el1_data_abort(esr) && !is_el1_instruction_abort(esr)) return false; if (fsc_type == ESR_ELx_FSC_PERM) @@ -258,7 +262,7 @@ static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr, unsigned long flags; u64 par, dfsc; - if (ESR_ELx_EC(esr) != ESR_ELx_EC_DABT_CUR || + if (!is_el1_data_abort(esr) || (esr & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_FAULT) return false; @@ -346,10 +350,9 @@ static void do_tag_recovery(unsigned long addr, unsigned int esr, static bool is_el1_mte_sync_tag_check_fault(unsigned int esr) { - unsigned int ec = ESR_ELx_EC(esr); unsigned int fsc = esr & ESR_ELx_FSC; - if (ec != ESR_ELx_EC_DABT_CUR) + if (!is_el1_data_abort(esr)) return false; if (fsc == ESR_ELx_FSC_MTE) -- cgit v1.2.3 From 814be609baae62aaa6c02fa6f3ad66cff32a6d15 Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Thu, 3 Jun 2021 16:34:51 +0800 Subject: drivers/perf: hisi: Fix data source control 'Data source' is a new function for HHA PMU and config / clear interface was wrong by mistake. 'HHA_DATSRC_CTRL' register is mainly used for data source configuration, if we enable bit0 as driver, it will go on count the event and we didn't check it carefully. So fix the issue and do as the initial purpose. 
Fixes: 932f6a99f9b0 ("drivers/perf: hisi: Add new functions for HHA PMU") Reported-by: kernel test robot Cc: Will Deacon Cc: Mark Rutland Signed-off-by: Shaokun Zhang Link: https://lore.kernel.org/r/1622709291-37996-1-git-send-email-zhangshaokun@hisilicon.com Signed-off-by: Will Deacon --- drivers/perf/hisilicon/hisi_uncore_hha_pmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c b/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c index 12b2c5e6d488..393513150106 100644 --- a/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c @@ -90,7 +90,7 @@ static void hisi_hha_pmu_config_ds(struct perf_event *event) val = readl(hha_pmu->base + HHA_DATSRC_CTRL); val |= HHA_DATSRC_SKT_EN; - writel(ds_skt, hha_pmu->base + HHA_DATSRC_CTRL); + writel(val, hha_pmu->base + HHA_DATSRC_CTRL); } } @@ -104,7 +104,7 @@ static void hisi_hha_pmu_clear_ds(struct perf_event *event) val = readl(hha_pmu->base + HHA_DATSRC_CTRL); val &= ~HHA_DATSRC_SKT_EN; - writel(ds_skt, hha_pmu->base + HHA_DATSRC_CTRL); + writel(val, hha_pmu->base + HHA_DATSRC_CTRL); } } -- cgit v1.2.3 From 92638b4e1b47f97d7269e74465dedf73096f777d Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Wed, 2 Jun 2021 16:52:27 -0700 Subject: mm: arch: remove indirection level in alloc_zeroed_user_highpage_movable() In an upcoming change we would like to add a flag to GFP_HIGHUSER_MOVABLE so that it would no longer be an OR of GFP_HIGHUSER and __GFP_MOVABLE. This poses a problem for alloc_zeroed_user_highpage_movable() which passes __GFP_MOVABLE into an arch-specific __alloc_zeroed_user_highpage() hook which ORs in GFP_HIGHUSER. Since __alloc_zeroed_user_highpage() is only ever called from alloc_zeroed_user_highpage_movable(), we can remove one level of indirection here. Remove __alloc_zeroed_user_highpage(), make alloc_zeroed_user_highpage_movable() the hook, and use GFP_HIGHUSER_MOVABLE in the hook implementations so that they will pick up the new flag that we are going to add. 
Signed-off-by: Peter Collingbourne Link: https://linux-review.googlesource.com/id/Ic6361c657b2cdcd896adbe0cf7cb5a7fbb1ed7bf Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20210602235230.3928842-2-pcc@google.com Signed-off-by: Will Deacon --- arch/alpha/include/asm/page.h | 6 +++--- arch/arm64/include/asm/page.h | 6 +++--- arch/ia64/include/asm/page.h | 6 +++--- arch/m68k/include/asm/page_no.h | 6 +++--- arch/s390/include/asm/page.h | 6 +++--- arch/x86/include/asm/page.h | 6 +++--- include/linux/highmem.h | 35 ++++++++--------------------------- 7 files changed, 26 insertions(+), 45 deletions(-) diff --git a/arch/alpha/include/asm/page.h b/arch/alpha/include/asm/page.h index 268f99b4602b..18f48a6f2ff6 100644 --- a/arch/alpha/include/asm/page.h +++ b/arch/alpha/include/asm/page.h @@ -17,9 +17,9 @@ extern void clear_page(void *page); #define clear_user_page(page, vaddr, pg) clear_page(page) -#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \ - alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vmaddr) -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE +#define alloc_zeroed_user_highpage_movable(vma, vaddr) \ + alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vmaddr) +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE extern void copy_page(void * _to, void * _from); #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h index 012cffc574e8..e1fc0f60e79f 100644 --- a/arch/arm64/include/asm/page.h +++ b/arch/arm64/include/asm/page.h @@ -28,9 +28,9 @@ void copy_user_highpage(struct page *to, struct page *from, void copy_highpage(struct page *to, struct page *from); #define __HAVE_ARCH_COPY_HIGHPAGE -#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \ - alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr) -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE +#define alloc_zeroed_user_highpage_movable(vma, vaddr) \ + alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr) +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE #define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) diff --git a/arch/ia64/include/asm/page.h b/arch/ia64/include/asm/page.h index f4dc81fa7146..1b990466d540 100644 --- a/arch/ia64/include/asm/page.h +++ b/arch/ia64/include/asm/page.h @@ -82,16 +82,16 @@ do { \ } while (0) -#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \ +#define alloc_zeroed_user_highpage_movable(vma, vaddr) \ ({ \ struct page *page = alloc_page_vma( \ - GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr); \ + GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr); \ if (page) \ flush_dcache_page(page); \ page; \ }) -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) diff --git a/arch/m68k/include/asm/page_no.h b/arch/m68k/include/asm/page_no.h index 8d0f862ee9d7..c9d0d84158a4 100644 --- a/arch/m68k/include/asm/page_no.h +++ b/arch/m68k/include/asm/page_no.h @@ -13,9 +13,9 @@ extern unsigned long memory_end; #define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) -#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \ - alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr) -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE +#define alloc_zeroed_user_highpage_movable(vma, 
vaddr) \ + alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr) +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE #define __pa(vaddr) ((unsigned long)(vaddr)) #define __va(paddr) ((void *)((unsigned long)(paddr))) diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index cc98f9b78fd4..479dc76e0eca 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -68,9 +68,9 @@ static inline void copy_page(void *to, void *from) #define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) -#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \ - alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr) -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE +#define alloc_zeroed_user_highpage_movable(vma, vaddr) \ + alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr) +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE /* * These are used to make use of C type-checking.. diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index 7555b48803a8..4d5810c8fab7 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -34,9 +34,9 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr, copy_page(to, from); } -#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \ - alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr) -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE +#define alloc_zeroed_user_highpage_movable(vma, vaddr) \ + alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr) +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE #ifndef __pa #define __pa(x) __phys_addr((unsigned long)(x)) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 832b49b50c7b..54d0643b8fcf 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -152,28 +152,24 @@ static inline void clear_user_highpage(struct page *page, unsigned long vaddr) } #endif -#ifndef __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE +#ifndef __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE /** - * __alloc_zeroed_user_highpage - Allocate a zeroed HIGHMEM page for a VMA with caller-specified movable GFP flags - * @movableflags: The GFP flags related to the pages future ability to move like __GFP_MOVABLE + * alloc_zeroed_user_highpage_movable - Allocate a zeroed HIGHMEM page for a VMA that the caller knows can move * @vma: The VMA the page is to be allocated for * @vaddr: The virtual address the page will be inserted into * - * This function will allocate a page for a VMA but the caller is expected - * to specify via movableflags whether the page will be movable in the - * future or not + * This function will allocate a page for a VMA that the caller knows will + * be able to migrate in the future using move_pages() or reclaimed * * An architecture may override this function by defining - * __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE and providing their own + * __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE and providing their own * implementation. 
*/ static inline struct page * -__alloc_zeroed_user_highpage(gfp_t movableflags, - struct vm_area_struct *vma, - unsigned long vaddr) +alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma, + unsigned long vaddr) { - struct page *page = alloc_page_vma(GFP_HIGHUSER | movableflags, - vma, vaddr); + struct page *page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); if (page) clear_user_highpage(page, vaddr); @@ -182,21 +178,6 @@ __alloc_zeroed_user_highpage(gfp_t movableflags, } #endif -/** - * alloc_zeroed_user_highpage_movable - Allocate a zeroed HIGHMEM page for a VMA that the caller knows can move - * @vma: The VMA the page is to be allocated for - * @vaddr: The virtual address the page will be inserted into - * - * This function will allocate a page for a VMA that the caller knows will - * be able to migrate in the future using move_pages() or reclaimed - */ -static inline struct page * -alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma, - unsigned long vaddr) -{ - return __alloc_zeroed_user_highpage(__GFP_MOVABLE, vma, vaddr); -} - static inline void clear_highpage(struct page *page) { void *kaddr = kmap_atomic(page); -- cgit v1.2.3 From 7a3b835371883558eb63e069d891bd87f562380d Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Wed, 2 Jun 2021 16:52:28 -0700 Subject: kasan: use separate (un)poison implementation for integrated init Currently with integrated init page_alloc.c needs to know whether kasan_alloc_pages() will zero initialize memory, but this will start becoming more complicated once we start adding tag initialization support for user pages. To avoid page_alloc.c needing to know more details of what integrated init will do, move the unpoisoning logic for integrated init into the HW tags implementation. Currently the logic is identical but it will diverge in subsequent patches. For symmetry do the same for poisoning although this logic will be unaffected by subsequent patches. 
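To illustrate the shape of the change (the mm/kasan/hw_tags.c hunks below are the authoritative version), the HW tags side gains thin wrappers that compute the init condition themselves and call the renamed (un)poison helpers:

  void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags)
  {
          /* Must match the condition used in post_alloc_hook(). */
          bool init = !want_init_on_free() && want_init_on_alloc(flags);

          kasan_unpoison_pages(page, order, init);
  }

  void kasan_free_pages(struct page *page, unsigned int order)
  {
          /* Must match the condition used in free_pages_prepare(). */
          bool init = want_init_on_free();

          kasan_poison_pages(page, order, init);
  }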
Signed-off-by: Peter Collingbourne Reviewed-by: Andrey Konovalov Link: https://linux-review.googlesource.com/id/I2c550234c6c4a893c48c18ff0c6ce658c7c67056 Link: https://lore.kernel.org/r/20210602235230.3928842-3-pcc@google.com Signed-off-by: Will Deacon --- include/linux/kasan.h | 64 ++++++++++++++++++++++++++++++--------------------- mm/kasan/common.c | 4 ++-- mm/kasan/hw_tags.c | 22 ++++++++++++++++++ mm/mempool.c | 6 +++-- mm/page_alloc.c | 55 ++++++++++++++++++++++--------------------- 5 files changed, 95 insertions(+), 56 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index b1678a61e6a7..a1c7ce5f3e4f 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -2,6 +2,7 @@ #ifndef _LINUX_KASAN_H #define _LINUX_KASAN_H +#include #include #include @@ -79,14 +80,6 @@ static inline void kasan_disable_current(void) {} #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ -#ifdef CONFIG_KASAN - -struct kasan_cache { - int alloc_meta_offset; - int free_meta_offset; - bool is_kmalloc; -}; - #ifdef CONFIG_KASAN_HW_TAGS DECLARE_STATIC_KEY_FALSE(kasan_flag_enabled); @@ -101,11 +94,14 @@ static inline bool kasan_has_integrated_init(void) return kasan_enabled(); } +void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags); +void kasan_free_pages(struct page *page, unsigned int order); + #else /* CONFIG_KASAN_HW_TAGS */ static inline bool kasan_enabled(void) { - return true; + return IS_ENABLED(CONFIG_KASAN); } static inline bool kasan_has_integrated_init(void) @@ -113,8 +109,30 @@ static inline bool kasan_has_integrated_init(void) return false; } +static __always_inline void kasan_alloc_pages(struct page *page, + unsigned int order, gfp_t flags) +{ + /* Only available for integrated init. */ + BUILD_BUG(); +} + +static __always_inline void kasan_free_pages(struct page *page, + unsigned int order) +{ + /* Only available for integrated init. 
*/ + BUILD_BUG(); +} + #endif /* CONFIG_KASAN_HW_TAGS */ +#ifdef CONFIG_KASAN + +struct kasan_cache { + int alloc_meta_offset; + int free_meta_offset; + bool is_kmalloc; +}; + slab_flags_t __kasan_never_merge(void); static __always_inline slab_flags_t kasan_never_merge(void) { @@ -130,20 +148,20 @@ static __always_inline void kasan_unpoison_range(const void *addr, size_t size) __kasan_unpoison_range(addr, size); } -void __kasan_alloc_pages(struct page *page, unsigned int order, bool init); -static __always_inline void kasan_alloc_pages(struct page *page, +void __kasan_poison_pages(struct page *page, unsigned int order, bool init); +static __always_inline void kasan_poison_pages(struct page *page, unsigned int order, bool init) { if (kasan_enabled()) - __kasan_alloc_pages(page, order, init); + __kasan_poison_pages(page, order, init); } -void __kasan_free_pages(struct page *page, unsigned int order, bool init); -static __always_inline void kasan_free_pages(struct page *page, - unsigned int order, bool init) +void __kasan_unpoison_pages(struct page *page, unsigned int order, bool init); +static __always_inline void kasan_unpoison_pages(struct page *page, + unsigned int order, bool init) { if (kasan_enabled()) - __kasan_free_pages(page, order, init); + __kasan_unpoison_pages(page, order, init); } void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size, @@ -285,21 +303,15 @@ void kasan_restore_multi_shot(bool enabled); #else /* CONFIG_KASAN */ -static inline bool kasan_enabled(void) -{ - return false; -} -static inline bool kasan_has_integrated_init(void) -{ - return false; -} static inline slab_flags_t kasan_never_merge(void) { return 0; } static inline void kasan_unpoison_range(const void *address, size_t size) {} -static inline void kasan_alloc_pages(struct page *page, unsigned int order, bool init) {} -static inline void kasan_free_pages(struct page *page, unsigned int order, bool init) {} +static inline void kasan_poison_pages(struct page *page, unsigned int order, + bool init) {} +static inline void kasan_unpoison_pages(struct page *page, unsigned int order, + bool init) {} static inline void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, slab_flags_t *flags) {} diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 6bb87f2acd4e..0ecd293af344 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -97,7 +97,7 @@ slab_flags_t __kasan_never_merge(void) return 0; } -void __kasan_alloc_pages(struct page *page, unsigned int order, bool init) +void __kasan_unpoison_pages(struct page *page, unsigned int order, bool init) { u8 tag; unsigned long i; @@ -111,7 +111,7 @@ void __kasan_alloc_pages(struct page *page, unsigned int order, bool init) kasan_unpoison(page_address(page), PAGE_SIZE << order, init); } -void __kasan_free_pages(struct page *page, unsigned int order, bool init) +void __kasan_poison_pages(struct page *page, unsigned int order, bool init) { if (likely(!PageHighMem(page))) kasan_poison(page_address(page), PAGE_SIZE << order, diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 4004388b4e4b..9d0f6f934016 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -238,6 +238,28 @@ struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, return &alloc_meta->free_track[0]; } +void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags) +{ + /* + * This condition should match the one in post_alloc_hook() in + * page_alloc.c. 
+ */ + bool init = !want_init_on_free() && want_init_on_alloc(flags); + + kasan_unpoison_pages(page, order, init); +} + +void kasan_free_pages(struct page *page, unsigned int order) +{ + /* + * This condition should match the one in free_pages_prepare() in + * page_alloc.c. + */ + bool init = want_init_on_free(); + + kasan_poison_pages(page, order, init); +} + #if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) void kasan_set_tagging_report_once(bool state) diff --git a/mm/mempool.c b/mm/mempool.c index a258cf4de575..0b8afbec3e35 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -106,7 +106,8 @@ static __always_inline void kasan_poison_element(mempool_t *pool, void *element) if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) kasan_slab_free_mempool(element); else if (pool->alloc == mempool_alloc_pages) - kasan_free_pages(element, (unsigned long)pool->pool_data, false); + kasan_poison_pages(element, (unsigned long)pool->pool_data, + false); } static void kasan_unpoison_element(mempool_t *pool, void *element) @@ -114,7 +115,8 @@ static void kasan_unpoison_element(mempool_t *pool, void *element) if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) kasan_unpoison_range(element, __ksize(element)); else if (pool->alloc == mempool_alloc_pages) - kasan_alloc_pages(element, (unsigned long)pool->pool_data, false); + kasan_unpoison_pages(element, (unsigned long)pool->pool_data, + false); } static __always_inline void add_element(mempool_t *pool, void *element) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index aaa1655cf682..4fddb7cac3c6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -382,7 +382,7 @@ int page_group_by_mobility_disabled __read_mostly; static DEFINE_STATIC_KEY_TRUE(deferred_pages); /* - * Calling kasan_free_pages() only after deferred memory initialization + * Calling kasan_poison_pages() only after deferred memory initialization * has completed. Poisoning pages during deferred memory init will greatly * lengthen the process and cause problem in large memory systems as the * deferred pages initialization is done with interrupt disabled. @@ -394,15 +394,11 @@ static DEFINE_STATIC_KEY_TRUE(deferred_pages); * on-demand allocation and then freed again before the deferred pages * initialization is done, but this is not likely to happen. 
*/ -static inline void kasan_free_nondeferred_pages(struct page *page, int order, - bool init, fpi_t fpi_flags) +static inline bool should_skip_kasan_poison(fpi_t fpi_flags) { - if (static_branch_unlikely(&deferred_pages)) - return; - if (!IS_ENABLED(CONFIG_KASAN_GENERIC) && - (fpi_flags & FPI_SKIP_KASAN_POISON)) - return; - kasan_free_pages(page, order, init); + return static_branch_unlikely(&deferred_pages) || + (!IS_ENABLED(CONFIG_KASAN_GENERIC) && + (fpi_flags & FPI_SKIP_KASAN_POISON)); } /* Returns true if the struct page for the pfn is uninitialised */ @@ -453,13 +449,10 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn) return false; } #else -static inline void kasan_free_nondeferred_pages(struct page *page, int order, - bool init, fpi_t fpi_flags) +static inline bool should_skip_kasan_poison(fpi_t fpi_flags) { - if (!IS_ENABLED(CONFIG_KASAN_GENERIC) && - (fpi_flags & FPI_SKIP_KASAN_POISON)) - return; - kasan_free_pages(page, order, init); + return (!IS_ENABLED(CONFIG_KASAN_GENERIC) && + (fpi_flags & FPI_SKIP_KASAN_POISON)); } static inline bool early_page_uninitialised(unsigned long pfn) @@ -1245,7 +1238,7 @@ static __always_inline bool free_pages_prepare(struct page *page, unsigned int order, bool check_free, fpi_t fpi_flags) { int bad = 0; - bool init; + bool skip_kasan_poison = should_skip_kasan_poison(fpi_flags); VM_BUG_ON_PAGE(PageTail(page), page); @@ -1314,10 +1307,17 @@ static __always_inline bool free_pages_prepare(struct page *page, * With hardware tag-based KASAN, memory tags must be set before the * page becomes unavailable via debug_pagealloc or arch_free_page. */ - init = want_init_on_free(); - if (init && !kasan_has_integrated_init()) - kernel_init_free_pages(page, 1 << order); - kasan_free_nondeferred_pages(page, order, init, fpi_flags); + if (kasan_has_integrated_init()) { + if (!skip_kasan_poison) + kasan_free_pages(page, order); + } else { + bool init = want_init_on_free(); + + if (init) + kernel_init_free_pages(page, 1 << order); + if (!skip_kasan_poison) + kasan_poison_pages(page, order, init); + } /* * arch_free_page() can make the page's contents inaccessible. s390 @@ -2324,8 +2324,6 @@ static bool check_new_pages(struct page *page, unsigned int order) inline void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags) { - bool init; - set_page_private(page, 0); set_page_refcounted(page); @@ -2344,10 +2342,15 @@ inline void post_alloc_hook(struct page *page, unsigned int order, * kasan_alloc_pages and kernel_init_free_pages must be * kept together to avoid discrepancies in behavior. */ - init = !want_init_on_free() && want_init_on_alloc(gfp_flags); - kasan_alloc_pages(page, order, init); - if (init && !kasan_has_integrated_init()) - kernel_init_free_pages(page, 1 << order); + if (kasan_has_integrated_init()) { + kasan_alloc_pages(page, order, gfp_flags); + } else { + bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags); + + kasan_unpoison_pages(page, order, init); + if (init) + kernel_init_free_pages(page, 1 << order); + } set_page_owner(page, order, gfp_flags); } -- cgit v1.2.3 From 013bb59dbb7cf876449df860946458a595a96d51 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Wed, 2 Jun 2021 16:52:29 -0700 Subject: arm64: mte: handle tags zeroing at page allocation time Currently, on an anonymous page fault, the kernel allocates a zeroed page and maps it in user space. If the mapping is tagged (PROT_MTE), set_pte_at() additionally clears the tags. 
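Schematically, that means two separate passes over the freshly allocated page (an illustrative sketch only, not the literal mm code; the local variable names here are made up):

  page = alloc_zeroed_user_highpage_movable(vma, addr);  /* pass 1: zero the data */
  entry = mk_pte(page, vma->vm_page_prot);
  set_pte_at(vma->vm_mm, addr, ptep, entry);              /* pass 2: tags cleared for PROT_MTE */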
It is, however, more efficient to clear the tags at the same time as zeroing the data on allocation. To avoid clearing the tags on any page (which may not be mapped as tagged), only do this if the vma flags contain VM_MTE. This requires introducing a new GFP flag that is used to determine whether to clear the tags. The DC GZVA instruction with a 0 top byte (and 0 tag) requires top-byte-ignore. Set the TCR_EL1.{TBI1,TBID1} bits irrespective of whether KASAN_HW is enabled. Signed-off-by: Peter Collingbourne Co-developed-by: Catalin Marinas Signed-off-by: Catalin Marinas Link: https://linux-review.googlesource.com/id/Id46dc94e30fe11474f7e54f5d65e7658dbdddb26 Reviewed-by: Catalin Marinas Reviewed-by: Andrey Konovalov Link: https://lore.kernel.org/r/20210602235230.3928842-4-pcc@google.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/mte.h | 4 ++++ arch/arm64/include/asm/page.h | 8 ++++++-- arch/arm64/lib/mte.S | 20 ++++++++++++++++++++ arch/arm64/mm/fault.c | 26 ++++++++++++++++++++++++++ arch/arm64/mm/proc.S | 10 +++++++--- include/linux/gfp.h | 9 +++++++-- include/linux/highmem.h | 8 ++++++++ mm/kasan/hw_tags.c | 9 ++++++++- mm/page_alloc.c | 13 ++++++++++--- 9 files changed, 96 insertions(+), 11 deletions(-) diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h index bc88a1ced0d7..67bf259ae768 100644 --- a/arch/arm64/include/asm/mte.h +++ b/arch/arm64/include/asm/mte.h @@ -37,6 +37,7 @@ void mte_free_tag_storage(char *storage); /* track which pages have valid allocation tags */ #define PG_mte_tagged PG_arch_2 +void mte_zero_clear_page_tags(void *addr); void mte_sync_tags(pte_t *ptep, pte_t pte); void mte_copy_page_tags(void *kto, const void *kfrom); void mte_thread_init_user(void); @@ -53,6 +54,9 @@ int mte_ptrace_copy_tags(struct task_struct *child, long request, /* unused if !CONFIG_ARM64_MTE, silence the compiler */ #define PG_mte_tagged 0 +static inline void mte_zero_clear_page_tags(void *addr) +{ +} static inline void mte_sync_tags(pte_t *ptep, pte_t pte) { } diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h index e1fc0f60e79f..ed1b9dcf12b2 100644 --- a/arch/arm64/include/asm/page.h +++ b/arch/arm64/include/asm/page.h @@ -13,6 +13,7 @@ #ifndef __ASSEMBLY__ #include /* for READ_IMPLIES_EXEC */ +#include /* for gfp_t */ #include struct page; @@ -28,10 +29,13 @@ void copy_user_highpage(struct page *to, struct page *from, void copy_highpage(struct page *to, struct page *from); #define __HAVE_ARCH_COPY_HIGHPAGE -#define alloc_zeroed_user_highpage_movable(vma, vaddr) \ - alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr) +struct page *alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma, + unsigned long vaddr); #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE +void tag_clear_highpage(struct page *to); +#define __HAVE_ARCH_TAG_CLEAR_HIGHPAGE + #define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) diff --git a/arch/arm64/lib/mte.S b/arch/arm64/lib/mte.S index 351537c12f36..e83643b3995f 100644 --- a/arch/arm64/lib/mte.S +++ b/arch/arm64/lib/mte.S @@ -36,6 +36,26 @@ SYM_FUNC_START(mte_clear_page_tags) ret SYM_FUNC_END(mte_clear_page_tags) +/* + * Zero the page and tags at the same time + * + * Parameters: + * x0 - address to the beginning of the page + */ +SYM_FUNC_START(mte_zero_clear_page_tags) + mrs x1, dczid_el0 + and w1, w1, #0xf + mov x2, #4 + lsl x1, x2, x1 + and x0, x0, #(1 << MTE_TAG_SHIFT) - 1 // clear the tag + +1: dc gzva, x0 + add x0, x0, 
x1 + tst x0, #(PAGE_SIZE - 1) + b.ne 1b + ret +SYM_FUNC_END(mte_zero_clear_page_tags) + /* * Copy the tags from the source page to the destination one * x0 - address of the destination page diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 871c82ab0a30..180c0343d82a 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -921,3 +921,29 @@ void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr, debug_exception_exit(regs); } NOKPROBE_SYMBOL(do_debug_exception); + +/* + * Used during anonymous page fault handling. + */ +struct page *alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma, + unsigned long vaddr) +{ + gfp_t flags = GFP_HIGHUSER_MOVABLE | __GFP_ZERO; + + /* + * If the page is mapped with PROT_MTE, initialise the tags at the + * point of allocation and page zeroing as this is usually faster than + * separate DC ZVA and STGM. + */ + if (vma->vm_flags & VM_MTE) + flags |= __GFP_ZEROTAGS; + + return alloc_page_vma(flags, vma, vaddr); +} + +void tag_clear_highpage(struct page *page) +{ + mte_zero_clear_page_tags(page_address(page)); + page_kasan_tag_reset(page); + set_bit(PG_mte_tagged, &page->flags); +} diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 97d7bcd8d4f2..48fd1df3d05a 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -46,9 +46,13 @@ #endif #ifdef CONFIG_KASAN_HW_TAGS -#define TCR_KASAN_HW_FLAGS SYS_TCR_EL1_TCMA1 | TCR_TBI1 | TCR_TBID1 +#define TCR_MTE_FLAGS SYS_TCR_EL1_TCMA1 | TCR_TBI1 | TCR_TBID1 #else -#define TCR_KASAN_HW_FLAGS 0 +/* + * The mte_zero_clear_page_tags() implementation uses DC GZVA, which relies on + * TBI being enabled at EL1. + */ +#define TCR_MTE_FLAGS TCR_TBI1 | TCR_TBID1 #endif /* @@ -464,7 +468,7 @@ SYM_FUNC_START(__cpu_setup) msr_s SYS_TFSRE0_EL1, xzr /* set the TCR_EL1 bits */ - mov_q x10, TCR_KASAN_HW_FLAGS + mov_q x10, TCR_MTE_FLAGS orr tcr, tcr, x10 1: #endif diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 11da8af06704..68ba237365dc 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -53,8 +53,9 @@ struct vm_area_struct; #define ___GFP_HARDWALL 0x100000u #define ___GFP_THISNODE 0x200000u #define ___GFP_ACCOUNT 0x400000u +#define ___GFP_ZEROTAGS 0x800000u #ifdef CONFIG_LOCKDEP -#define ___GFP_NOLOCKDEP 0x800000u +#define ___GFP_NOLOCKDEP 0x1000000u #else #define ___GFP_NOLOCKDEP 0 #endif @@ -229,16 +230,20 @@ struct vm_area_struct; * %__GFP_COMP address compound page metadata. * * %__GFP_ZERO returns a zeroed page on success. + * + * %__GFP_ZEROTAGS returns a page with zeroed memory tags on success, if + * __GFP_ZERO is set. 
*/ #define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN) #define __GFP_COMP ((__force gfp_t)___GFP_COMP) #define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) +#define __GFP_ZEROTAGS ((__force gfp_t)___GFP_ZEROTAGS) /* Disable lockdep for GFP context tracking */ #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP) /* Room for N __GFP_FOO bits */ -#define __GFP_BITS_SHIFT (23 + IS_ENABLED(CONFIG_LOCKDEP)) +#define __GFP_BITS_SHIFT (24 + IS_ENABLED(CONFIG_LOCKDEP)) #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /** diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 54d0643b8fcf..8c6e8e996c87 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -185,6 +185,14 @@ static inline void clear_highpage(struct page *page) kunmap_atomic(kaddr); } +#ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGE + +static inline void tag_clear_highpage(struct page *page) +{ +} + +#endif + /* * If we pass in a base or tail page, we can zero up to PAGE_SIZE. * If we pass in a head page, we can zero up to the size of the compound page. diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 9d0f6f934016..41fd5326ee0a 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -246,7 +246,14 @@ void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags) */ bool init = !want_init_on_free() && want_init_on_alloc(flags); - kasan_unpoison_pages(page, order, init); + if (flags & __GFP_ZEROTAGS) { + int i; + + for (i = 0; i != 1 << order; ++i) + tag_clear_highpage(page + i); + } else { + kasan_unpoison_pages(page, order, init); + } } void kasan_free_pages(struct page *page, unsigned int order) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4fddb7cac3c6..13937e793fda 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1219,10 +1219,16 @@ out: return ret; } -static void kernel_init_free_pages(struct page *page, int numpages) +static void kernel_init_free_pages(struct page *page, int numpages, bool zero_tags) { int i; + if (zero_tags) { + for (i = 0; i < numpages; i++) + tag_clear_highpage(page + i); + return; + } + /* s390's use of memset() could override KASAN redzones. */ kasan_disable_current(); for (i = 0; i < numpages; i++) { @@ -1314,7 +1320,7 @@ static __always_inline bool free_pages_prepare(struct page *page, bool init = want_init_on_free(); if (init) - kernel_init_free_pages(page, 1 << order); + kernel_init_free_pages(page, 1 << order, false); if (!skip_kasan_poison) kasan_poison_pages(page, order, init); } @@ -2349,7 +2355,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order, kasan_unpoison_pages(page, order, init); if (init) - kernel_init_free_pages(page, 1 << order); + kernel_init_free_pages(page, 1 << order, + gfp_flags & __GFP_ZEROTAGS); } set_page_owner(page, order, gfp_flags); -- cgit v1.2.3 From c275c5c6d50a0518cdb0584e85905d10e7cefc6e Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Wed, 2 Jun 2021 16:52:30 -0700 Subject: kasan: disable freed user page poisoning with HW tags Poisoning freed pages protects against kernel use-after-free. The likelihood of such a bug involving kernel pages is significantly higher than that for user pages. At the same time, poisoning freed pages can impose a significant performance cost, which cannot always be justified for user pages given the lower probability of finding a bug. Therefore, disable freed user page poisoning when using HW tags. 
We identify "user" pages via the flag set GFP_HIGHUSER_MOVABLE, which indicates a strong likelihood of not being directly accessible to the kernel. Signed-off-by: Peter Collingbourne Reviewed-by: Andrey Konovalov Link: https://linux-review.googlesource.com/id/I716846e2de8ef179f44e835770df7e6307be96c9 Link: https://lore.kernel.org/r/20210602235230.3928842-5-pcc@google.com Signed-off-by: Will Deacon --- include/linux/gfp.h | 13 ++++++++++--- include/linux/page-flags.h | 9 +++++++++ include/trace/events/mmflags.h | 9 ++++++++- mm/kasan/hw_tags.c | 3 +++ mm/page_alloc.c | 12 +++++++----- 5 files changed, 37 insertions(+), 9 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 68ba237365dc..e6102dfa4faa 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -54,8 +54,9 @@ struct vm_area_struct; #define ___GFP_THISNODE 0x200000u #define ___GFP_ACCOUNT 0x400000u #define ___GFP_ZEROTAGS 0x800000u +#define ___GFP_SKIP_KASAN_POISON 0x1000000u #ifdef CONFIG_LOCKDEP -#define ___GFP_NOLOCKDEP 0x1000000u +#define ___GFP_NOLOCKDEP 0x2000000u #else #define ___GFP_NOLOCKDEP 0 #endif @@ -233,17 +234,22 @@ struct vm_area_struct; * * %__GFP_ZEROTAGS returns a page with zeroed memory tags on success, if * __GFP_ZERO is set. + * + * %__GFP_SKIP_KASAN_POISON returns a page which does not need to be poisoned + * on deallocation. Typically used for userspace pages. Currently only has an + * effect in HW tags mode. */ #define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN) #define __GFP_COMP ((__force gfp_t)___GFP_COMP) #define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) #define __GFP_ZEROTAGS ((__force gfp_t)___GFP_ZEROTAGS) +#define __GFP_SKIP_KASAN_POISON ((__force gfp_t)___GFP_SKIP_KASAN_POISON) /* Disable lockdep for GFP context tracking */ #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP) /* Room for N __GFP_FOO bits */ -#define __GFP_BITS_SHIFT (24 + IS_ENABLED(CONFIG_LOCKDEP)) +#define __GFP_BITS_SHIFT (25 + IS_ENABLED(CONFIG_LOCKDEP)) #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /** @@ -324,7 +330,8 @@ struct vm_area_struct; #define GFP_DMA __GFP_DMA #define GFP_DMA32 __GFP_DMA32 #define GFP_HIGHUSER (GFP_USER | __GFP_HIGHMEM) -#define GFP_HIGHUSER_MOVABLE (GFP_HIGHUSER | __GFP_MOVABLE) +#define GFP_HIGHUSER_MOVABLE (GFP_HIGHUSER | __GFP_MOVABLE | \ + __GFP_SKIP_KASAN_POISON) #define GFP_TRANSHUGE_LIGHT ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ __GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM) #define GFP_TRANSHUGE (GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 04a34c08e0a6..40e2c5000585 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -137,6 +137,9 @@ enum pageflags { #endif #ifdef CONFIG_64BIT PG_arch_2, +#endif +#ifdef CONFIG_KASAN_HW_TAGS + PG_skip_kasan_poison, #endif __NR_PAGEFLAGS, @@ -443,6 +446,12 @@ TESTCLEARFLAG(Young, young, PF_ANY) PAGEFLAG(Idle, idle, PF_ANY) #endif +#ifdef CONFIG_KASAN_HW_TAGS +PAGEFLAG(SkipKASanPoison, skip_kasan_poison, PF_HEAD) +#else +PAGEFLAG_FALSE(SkipKASanPoison) +#endif + /* * PageReported() is used to track reported free pages within the Buddy * allocator. 
We can use the non-atomic version of the test and set diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 629c7a0eaff2..390270e00a1d 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -85,6 +85,12 @@ #define IF_HAVE_PG_ARCH_2(flag,string) #endif +#ifdef CONFIG_KASAN_HW_TAGS +#define IF_HAVE_PG_SKIP_KASAN_POISON(flag,string) ,{1UL << flag, string} +#else +#define IF_HAVE_PG_SKIP_KASAN_POISON(flag,string) +#endif + #define __def_pageflag_names \ {1UL << PG_locked, "locked" }, \ {1UL << PG_waiters, "waiters" }, \ @@ -112,7 +118,8 @@ IF_HAVE_PG_UNCACHED(PG_uncached, "uncached" ) \ IF_HAVE_PG_HWPOISON(PG_hwpoison, "hwpoison" ) \ IF_HAVE_PG_IDLE(PG_young, "young" ) \ IF_HAVE_PG_IDLE(PG_idle, "idle" ) \ -IF_HAVE_PG_ARCH_2(PG_arch_2, "arch_2" ) +IF_HAVE_PG_ARCH_2(PG_arch_2, "arch_2" ) \ +IF_HAVE_PG_SKIP_KASAN_POISON(PG_skip_kasan_poison, "skip_kasan_poison") #define show_page_flags(flags) \ (flags) ? __print_flags(flags, "|", \ diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 41fd5326ee0a..ed5e5b833d61 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -246,6 +246,9 @@ void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags) */ bool init = !want_init_on_free() && want_init_on_alloc(flags); + if (flags & __GFP_SKIP_KASAN_POISON) + SetPageSkipKASanPoison(page); + if (flags & __GFP_ZEROTAGS) { int i; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 13937e793fda..5ad76e540a22 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -394,11 +394,12 @@ static DEFINE_STATIC_KEY_TRUE(deferred_pages); * on-demand allocation and then freed again before the deferred pages * initialization is done, but this is not likely to happen. */ -static inline bool should_skip_kasan_poison(fpi_t fpi_flags) +static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) { return static_branch_unlikely(&deferred_pages) || (!IS_ENABLED(CONFIG_KASAN_GENERIC) && - (fpi_flags & FPI_SKIP_KASAN_POISON)); + (fpi_flags & FPI_SKIP_KASAN_POISON)) || + PageSkipKASanPoison(page); } /* Returns true if the struct page for the pfn is uninitialised */ @@ -449,10 +450,11 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn) return false; } #else -static inline bool should_skip_kasan_poison(fpi_t fpi_flags) +static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) { return (!IS_ENABLED(CONFIG_KASAN_GENERIC) && - (fpi_flags & FPI_SKIP_KASAN_POISON)); + (fpi_flags & FPI_SKIP_KASAN_POISON)) || + PageSkipKASanPoison(page); } static inline bool early_page_uninitialised(unsigned long pfn) @@ -1244,7 +1246,7 @@ static __always_inline bool free_pages_prepare(struct page *page, unsigned int order, bool check_free, fpi_t fpi_flags) { int bad = 0; - bool skip_kasan_poison = should_skip_kasan_poison(fpi_flags); + bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags); VM_BUG_ON_PAGE(PageTail(page), page); -- cgit v1.2.3 From 382dcdd66ce86491ddd390b39224468c82a47892 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Jun 2021 10:46:05 +0100 Subject: arm64: remove redundant local_daif_mask() in bad_mode() Upon taking an exception, the CPU sets all the DAIF bits. We never clear any of these bits prior to calling bad_mode(), and bad_mode() itself never clears any of these bits, so there's no need to call local_daif_mask(). This patch removes the redundant call. 
Signed-off-by: Mark Rutland Acked-by: Marc Zyngier Acked-by: Catalin Marinas Reviewed-by: Joey Gouly Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20210607094624.34689-2-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/traps.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index a05d34f0e82a..41f0aa92022a 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -765,7 +765,6 @@ asmlinkage void notrace bad_mode(struct pt_regs *regs, int reason, unsigned int esr_get_class_string(esr)); __show_regs(regs); - local_daif_mask(); panic("bad mode"); } -- cgit v1.2.3 From f7c706f0391d7894d1ae2d28cb2d5446f5ec59ad Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Jun 2021 10:46:06 +0100 Subject: arm64: entry: unmask IRQ+FIQ after EL0 handling For non-fatal exceptions taken from EL0, we expect that at some point during exception handling it is possible to return to a regular process context with all exceptions unmasked (e.g. as we do in do_notify_resume()), and we generally aim to unmask exceptions wherever possible. While handling SError and debug exceptions from EL0, we need to leave some exceptions masked during handling. Handling SError requires us to mask SError (which also requires masking IRQ+FIQ), and handling debug exceptions requires us to mask debug (which also requires masking SError+IRQ+FIQ). Once do_serror() or do_debug_exception() has returned, we no longer need to mask exceptions, and can unmask them all, which is what we did prior to commit: 9034f6251572a474 ("arm64: Do not enable IRQs for ct_user_exit") ... where we had to mask IRQs because context_tracking_user_exit() expected IRQs to be masked. Since then, we realised that our context tracking wasn't entirely correct, and reworked the entry code to fix this. As of commit: 23529049c6842382 ("arm64: entry: fix non-NMI user<->kernel transitions") ... we replaced the call to context_tracking_user_exit() with a call to user_exit_irqoff() as part of enter_from_user_mode(), which occurs earlier, before we run the body of the handler and unmask exceptions in DAIF. When we return to userspace, we go via ret_to_user(), which masks exceptions in DAIF prior to calling user_enter_irqoff() as part of exit_to_user_mode(). Thus, there's no longer a reason to leave IRQs or FIQs masked at the end of the EL0 debug or error handlers, as neither the user exit context tracking nor the user entry context tracking requires this. Let's bring these into line with other EL0 exception handlers and ensure that IRQ and FIQ are unmasked in DAIF at some point during the handler.
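Concretely, the EL0 debug handler now finishes by restoring DAIF_PROCCTX rather than DAIF_PROCCTX_NOIRQ; a simplified view of the entry-common.c hunk below (the SError path gets the equivalent treatment in entry.S, where enable_da becomes enable_daif):

  static void noinstr el0_dbg(struct pt_regs *regs, unsigned long esr)
  {
          unsigned long far = read_sysreg(far_el1);

          enter_from_user_mode();
          do_debug_exception(far, esr, regs);
          local_daif_restore(DAIF_PROCCTX);       /* was DAIF_PROCCTX_NOIRQ */
  }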
Signed-off-by: Mark Rutland Acked-by: Catalin Marinas Acked-by: Marc Zyngier Reviewed-by: Joey Gouly Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20210607094624.34689-3-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/entry-common.c | 2 +- arch/arm64/kernel/entry.S | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 340d04e13617..02be1517e08f 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -398,7 +398,7 @@ static void noinstr el0_dbg(struct pt_regs *regs, unsigned long esr) enter_from_user_mode(); do_debug_exception(far, esr, regs); - local_daif_restore(DAIF_PROCCTX_NOIRQ); + local_daif_restore(DAIF_PROCCTX); } static void noinstr el0_svc(struct pt_regs *regs) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 3513984a88bd..6b2f6f5c5bb8 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -794,7 +794,7 @@ el0_error_naked: mov x0, sp mov x1, x25 bl do_serror - enable_da + enable_daif b ret_to_user SYM_CODE_END(el0_error) -- cgit v1.2.3 From bb8e93a287a5f5f10fe7a9d8f612f6105c9622ef Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Jun 2021 10:46:07 +0100 Subject: arm64: entry: convert SError handlers to C For various reasons we'd like to convert the bulk of arm64's exception triage logic to C. As a step towards that, this patch converts the EL1 and EL0 SError triage logic to C. Separate C functions are added for the native and compat cases so that in subsequent patches we can handle native/compat differences in C. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Acked-by: Catalin Marinas Acked-by: Marc Zyngier Reviewed-by: Joey Gouly Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20210607094624.34689-4-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/exception.h | 4 ++++ arch/arm64/kernel/entry-common.c | 32 ++++++++++++++++++++++++++++++++ arch/arm64/kernel/entry.S | 16 +++++----------- arch/arm64/kernel/traps.c | 6 +----- 4 files changed, 42 insertions(+), 16 deletions(-) diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index 6546158d2f2d..3a859d4e8b59 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -32,8 +32,11 @@ static inline u32 disr_to_esr(u64 disr) } asmlinkage void el1_sync_handler(struct pt_regs *regs); +asmlinkage void el1_error_handler(struct pt_regs *regs); asmlinkage void el0_sync_handler(struct pt_regs *regs); +asmlinkage void el0_error_handler(struct pt_regs *regs); asmlinkage void el0_sync_compat_handler(struct pt_regs *regs); +asmlinkage void el0_error_compat_handler(struct pt_regs *regs); asmlinkage void noinstr enter_el1_irq_or_nmi(struct pt_regs *regs); asmlinkage void noinstr exit_el1_irq_or_nmi(struct pt_regs *regs); @@ -57,4 +60,5 @@ void do_cp15instr(unsigned int esr, struct pt_regs *regs); void do_el0_svc(struct pt_regs *regs); void do_el0_svc_compat(struct pt_regs *regs); void do_ptrauth_fault(struct pt_regs *regs, unsigned int esr); +void do_serror(struct pt_regs *regs, unsigned int esr); #endif /* __ASM_EXCEPTION_H */ diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 02be1517e08f..3b7943721077 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -279,6 +279,16 @@ asmlinkage void noinstr el1_sync_handler(struct 
pt_regs *regs) } } +asmlinkage void noinstr el1_error_handler(struct pt_regs *regs) +{ + unsigned long esr = read_sysreg(esr_el1); + + local_daif_restore(DAIF_ERRCTX); + arm64_enter_nmi(regs); + do_serror(regs, esr); + arm64_exit_nmi(regs); +} + asmlinkage void noinstr enter_from_user_mode(void) { lockdep_hardirqs_off(CALLER_ADDR0); @@ -468,6 +478,23 @@ asmlinkage void noinstr el0_sync_handler(struct pt_regs *regs) } } +static void __el0_error_handler_common(struct pt_regs *regs) +{ + unsigned long esr = read_sysreg(esr_el1); + + enter_from_user_mode(); + local_daif_restore(DAIF_ERRCTX); + arm64_enter_nmi(regs); + do_serror(regs, esr); + arm64_exit_nmi(regs); + local_daif_restore(DAIF_PROCCTX); +} + +asmlinkage void noinstr el0_error_handler(struct pt_regs *regs) +{ + __el0_error_handler_common(regs); +} + #ifdef CONFIG_COMPAT static void noinstr el0_cp15(struct pt_regs *regs, unsigned long esr) { @@ -526,4 +553,9 @@ asmlinkage void noinstr el0_sync_compat_handler(struct pt_regs *regs) el0_inv(regs, esr); } } + +asmlinkage void noinstr el0_error_compat_handler(struct pt_regs *regs) +{ + __el0_error_handler_common(regs); +} #endif /* CONFIG_COMPAT */ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 6b2f6f5c5bb8..656f3129bfef 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -757,7 +757,9 @@ SYM_CODE_END(el0_fiq_compat) SYM_CODE_START_LOCAL_NOALIGN(el0_error_compat) kernel_entry 0, 32 - b el0_error_naked + mov x0, sp + bl el0_error_compat_handler + b ret_to_user SYM_CODE_END(el0_error_compat) #endif @@ -778,23 +780,15 @@ SYM_CODE_END(el0_fiq) SYM_CODE_START_LOCAL(el1_error) kernel_entry 1 - mrs x1, esr_el1 - enable_dbg mov x0, sp - bl do_serror + bl el1_error_handler kernel_exit 1 SYM_CODE_END(el1_error) SYM_CODE_START_LOCAL(el0_error) kernel_entry 0 -el0_error_naked: - mrs x25, esr_el1 - user_exit_irqoff - enable_dbg mov x0, sp - mov x1, x25 - bl do_serror - enable_daif + bl el0_error_handler b ret_to_user SYM_CODE_END(el0_error) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 41f0aa92022a..5fd12d19ef4b 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -869,15 +869,11 @@ bool arm64_is_fatal_ras_serror(struct pt_regs *regs, unsigned int esr) } } -asmlinkage void noinstr do_serror(struct pt_regs *regs, unsigned int esr) +void do_serror(struct pt_regs *regs, unsigned int esr) { - arm64_enter_nmi(regs); - /* non-RAS errors are not containable */ if (!arm64_is_ras_serror(esr) || arm64_is_fatal_ras_serror(regs, esr)) arm64_serror_panic(regs, esr); - - arm64_exit_nmi(regs); } /* GENERIC_BUG traps */ -- cgit v1.2.3 From 33a3581a76f3a36c7dcc9864120ce681bcfbcff1 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Jun 2021 10:46:08 +0100 Subject: arm64: entry: move arm64_preempt_schedule_irq to entry-common.c Subsequent patches will pull more of the IRQ entry handling into C. To keep this in one place, let's move arm64_preempt_schedule_irq() into entry-common.c along with the other entry management functions. We no longer need to include in process.c, so the include directive is removed. There should be no functional change as a result of this patch. 
Reviewed-by Joey Gouly Signed-off-by: Mark Rutland Acked-by: Catalin Marinas Acked-by: Marc Zyngier Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20210607094624.34689-5-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/entry-common.c | 20 ++++++++++++++++++++ arch/arm64/kernel/process.c | 17 ----------------- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 3b7943721077..1fe60578e556 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -6,7 +6,11 @@ */ #include +#include +#include #include +#include +#include #include #include @@ -113,6 +117,22 @@ asmlinkage void noinstr exit_el1_irq_or_nmi(struct pt_regs *regs) exit_to_kernel_mode(regs); } +asmlinkage void __sched arm64_preempt_schedule_irq(void) +{ + lockdep_assert_irqs_disabled(); + + /* + * Preempting a task from an IRQ means we leave copies of PSTATE + * on the stack. cpufeature's enable calls may modify PSTATE, but + * resuming one of these preempted tasks would undo those changes. + * + * Only allow a task to be preempted once cpufeatures have been + * enabled. + */ + if (system_capabilities_finalized()) + preempt_schedule_irq(); +} + #ifdef CONFIG_ARM64_ERRATUM_1463225 static DEFINE_PER_CPU(int, __in_cortex_a76_erratum_1463225_wa); diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index b4bb67f17a2c..2e7337709155 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include @@ -724,22 +723,6 @@ static int __init tagged_addr_init(void) core_initcall(tagged_addr_init); #endif /* CONFIG_ARM64_TAGGED_ADDR_ABI */ -asmlinkage void __sched arm64_preempt_schedule_irq(void) -{ - lockdep_assert_irqs_disabled(); - - /* - * Preempting a task from an IRQ means we leave copies of PSTATE - * on the stack. cpufeature's enable calls may modify PSTATE, but - * resuming one of these preempted tasks would undo those changes. - * - * Only allow a task to be preempted once cpufeatures have been - * enabled. - */ - if (system_capabilities_finalized()) - preempt_schedule_irq(); -} - #ifdef CONFIG_BINFMT_ELF int arch_elf_adjust_prot(int prot, const struct arch_elf_state *state, bool has_interp, bool is_interp) -- cgit v1.2.3 From 101a5b665dcdff169ae7ad90556604c483d9027e Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Jun 2021 10:46:09 +0100 Subject: arm64: entry: move NMI preempt logic to C Currently portions of our preempt logic are written in C while other parts are written in assembly. Let's clean this up a little bit by moving the NMI preempt checks to C. For now, the preempt count (and need_resched) checking is left in assembly, and will be converted with the body of the IRQ handler in subsequent patches. Other than the increased lockdep coverage there should be no functional change as a result of this patch. 
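Taken together with the previous patch, the resulting C helper looks roughly like this (lightly condensed; the entry-common.c hunks in these two patches are authoritative):

  asmlinkage void __sched arm64_preempt_schedule_irq(void)
  {
          lockdep_assert_irqs_disabled();

          /*
           * DAIF.DA are cleared at the start of IRQ/FIQ handling, and with
           * GIC priority masking the irqchip driver clears DAIF.IF for
           * normal IRQs. Anything still set means we took an NMI, so skip
           * preemption.
           */
          if (system_uses_irq_prio_masking() && read_sysreg(daif))
                  return;

          /* Only preempt once cpufeatures have been enabled. */
          if (system_capabilities_finalized())
                  preempt_schedule_irq();
  }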
Signed-off-by: Mark Rutland Acked-by: Catalin Marinas Acked-by: Marc Zyngier Reviewed-by: Joey Gouly Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20210607094624.34689-6-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/entry-common.c | 9 +++++++++ arch/arm64/kernel/entry.S | 12 +----------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 1fe60578e556..08d17eb0ce13 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -121,6 +121,15 @@ asmlinkage void __sched arm64_preempt_schedule_irq(void) { lockdep_assert_irqs_disabled(); + /* + * DAIF.DA are cleared at the start of IRQ/FIQ handling, and when GIC + * priority masking is used the GIC irqchip driver will clear DAIF.IF + * using gic_arch_enable_irqs() for normal IRQs. If anything is set in + * DAIF we must have handled an NMI, so skip preemption. + */ + if (system_uses_irq_prio_masking() && read_sysreg(daif)) + return; + /* * Preempting a task from an IRQ means we leave copies of PSTATE * on the stack. cpufeature's enable calls may modify PSTATE, but diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 656f3129bfef..449628290ce8 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -562,17 +562,7 @@ tsk .req x28 // current thread_info #ifdef CONFIG_PREEMPTION ldr x24, [tsk, #TSK_TI_PREEMPT] // get preempt count -alternative_if ARM64_HAS_IRQ_PRIO_MASKING - /* - * DA were cleared at start of handling, and IF are cleared by - * the GIC irqchip driver using gic_arch_enable_irqs() for - * normal IRQs. If anything is set, it means we come back from - * an NMI instead of a normal IRQ, so skip preemption - */ - mrs x0, daif - orr x24, x24, x0 -alternative_else_nop_endif - cbnz x24, 1f // preempt count != 0 || NMI return path + cbnz x24, 1f // preempt count != 0 bl arm64_preempt_schedule_irq // irq en/disable is done inside 1: #endif -- cgit v1.2.3 From f8049488e7d37b0a0e438ee449e83b3e46958743 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Jun 2021 10:46:10 +0100 Subject: arm64: entry: add a call_on_irq_stack helper When handling IRQ/FIQ exceptions the entry assembly may transition from a task's stack to a CPU's IRQ stack (and IRQ shadow call stack). In subsequent patches we want to migrate the IRQ/FIQ triage logic to C, and as we want to perform some actions on the task stack (e.g. EL1 preemption), we need to switch stacks within the C handler. So that we can do so, this patch adds a helper to call a function on a CPU's IRQ stack (and shadow stack as appropriate). Subsequent patches will make use of the new helper function. 
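For context, the C triage code added later in this series drives the helper like so (this is do_interrupt_handler() from the IRQ+FIQ conversion patch below):

  static void do_interrupt_handler(struct pt_regs *regs,
                                   void (*handler)(struct pt_regs *))
  {
          /* Only switch stacks if we are still on the task stack. */
          if (on_thread_stack())
                  call_on_irq_stack(regs, handler);
          else
                  handler(regs);
  }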
Signed-off-by: Mark Rutland Acked-by: Catalin Marinas Acked-by: Marc Zyngier Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20210607094624.34689-7-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/exception.h | 2 ++ arch/arm64/kernel/entry.S | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index 3a859d4e8b59..c24b69c0c589 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -40,6 +40,8 @@ asmlinkage void el0_error_compat_handler(struct pt_regs *regs); asmlinkage void noinstr enter_el1_irq_or_nmi(struct pt_regs *regs); asmlinkage void noinstr exit_el1_irq_or_nmi(struct pt_regs *regs); +asmlinkage void call_on_irq_stack(struct pt_regs *regs, + void (*func)(struct pt_regs *)); asmlinkage void enter_from_user_mode(void); asmlinkage void exit_to_user_mode(void); void arm64_enter_nmi(struct pt_regs *regs); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 449628290ce8..8ca74ce115ee 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -982,6 +982,42 @@ SYM_CODE_START(ret_from_fork) SYM_CODE_END(ret_from_fork) NOKPROBE(ret_from_fork) +/* + * void call_on_irq_stack(struct pt_regs *regs, + * void (*func)(struct pt_regs *)); + * + * Calls func(regs) using this CPU's irq stack and shadow irq stack. + */ +SYM_FUNC_START(call_on_irq_stack) +#ifdef CONFIG_SHADOW_CALL_STACK + stp scs_sp, xzr, [sp, #-16]! + ldr_this_cpu scs_sp, irq_shadow_call_stack_ptr, x17 +#endif + /* Create a frame record to save our LR and SP (implicit in FP) */ + stp x29, x30, [sp, #-16]! + mov x29, sp + + ldr_this_cpu x16, irq_stack_ptr, x17 + mov x15, #IRQ_STACK_SIZE + add x16, x16, x15 + + /* Move to the new stack and call the function there */ + mov sp, x16 + blr x1 + + /* + * Restore the SP from the FP, and restore the FP and LR from the frame + * record. + */ + mov sp, x29 + ldp x29, x30, [sp], #16 +#ifdef CONFIG_SHADOW_CALL_STACK + ldp scs_sp, xzr, [sp], #16 +#endif + ret +SYM_FUNC_END(call_on_irq_stack) +NOKPROBE(call_on_irq_stack) + #ifdef CONFIG_ARM_SDE_INTERFACE #include -- cgit v1.2.3 From 064dbfb4169141943ec7d9dbfd02974dd008f2ce Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Jun 2021 10:46:11 +0100 Subject: arm64: entry: convert IRQ+FIQ handlers to C For various reasons we'd like to convert the bulk of arm64's exception triage logic to C. As a step towards that, this patch converts the EL1 and EL0 IRQ+FIQ triage logic to C. Separate C functions are added for the native and compat cases so that in subsequent patches we can handle native/compat differences in C. Since the triage functions can now call arm64_apply_bp_hardening() directly, the do_el0_irq_bp_hardening() wrapper function is removed. Since the user_exit_irqoff macro is now unused, it is removed. The user_enter_irqoff macro is still used by the ret_to_user code, and cannot be removed at this time. 
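The EL1 side of the conversion ends up as the following C triage routine (a simplified excerpt of the entry-common.c hunk below):

  static void noinstr el1_interrupt(struct pt_regs *regs,
                                    void (*handler)(struct pt_regs *))
  {
          write_sysreg(DAIF_PROCCTX_NOIRQ, daif);

          enter_el1_irq_or_nmi(regs);
          do_interrupt_handler(regs, handler);

          /*
           * Note: thread_info::preempt_count includes both thread_info::count
           * and thread_info::need_resched, and is not equivalent to
           * preempt_count().
           */
          if (IS_ENABLED(CONFIG_PREEMPTION) &&
              READ_ONCE(current_thread_info()->preempt_count) == 0)
                  arm64_preempt_schedule_irq();

          exit_el1_irq_or_nmi(regs);
  }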
Signed-off-by: Mark Rutland Acked-by: Catalin Marinas Acked-by: Marc Zyngier Reviewed-by: Joey Gouly Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20210607094624.34689-8-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/exception.h | 8 ++- arch/arm64/include/asm/processor.h | 2 - arch/arm64/kernel/entry-common.c | 93 ++++++++++++++++++++++++++++++-- arch/arm64/kernel/entry.S | 108 +++++-------------------------------- arch/arm64/mm/fault.c | 7 --- 5 files changed, 110 insertions(+), 108 deletions(-) diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index c24b69c0c589..4284ee57a9a5 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -32,14 +32,18 @@ static inline u32 disr_to_esr(u64 disr) } asmlinkage void el1_sync_handler(struct pt_regs *regs); +asmlinkage void el1_irq_handler(struct pt_regs *regs); +asmlinkage void el1_fiq_handler(struct pt_regs *regs); asmlinkage void el1_error_handler(struct pt_regs *regs); asmlinkage void el0_sync_handler(struct pt_regs *regs); +asmlinkage void el0_irq_handler(struct pt_regs *regs); +asmlinkage void el0_fiq_handler(struct pt_regs *regs); asmlinkage void el0_error_handler(struct pt_regs *regs); asmlinkage void el0_sync_compat_handler(struct pt_regs *regs); +asmlinkage void el0_irq_compat_handler(struct pt_regs *regs); +asmlinkage void el0_fiq_compat_handler(struct pt_regs *regs); asmlinkage void el0_error_compat_handler(struct pt_regs *regs); -asmlinkage void noinstr enter_el1_irq_or_nmi(struct pt_regs *regs); -asmlinkage void noinstr exit_el1_irq_or_nmi(struct pt_regs *regs); asmlinkage void call_on_irq_stack(struct pt_regs *regs, void (*func)(struct pt_regs *)); asmlinkage void enter_from_user_mode(void); diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 9df3feeee890..2f21c76324bb 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -257,8 +257,6 @@ void set_task_sctlr_el1(u64 sctlr); extern struct task_struct *cpu_switch_to(struct task_struct *prev, struct task_struct *next); -asmlinkage void arm64_preempt_schedule_irq(void); - #define task_pt_regs(p) \ ((struct pt_regs *)(THREAD_SIZE + task_stack_page(p)) - 1) diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 08d17eb0ce13..ae1b6d7c00e1 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include /* @@ -101,7 +103,7 @@ void noinstr arm64_exit_nmi(struct pt_regs *regs) __nmi_exit(); } -asmlinkage void noinstr enter_el1_irq_or_nmi(struct pt_regs *regs) +static void noinstr enter_el1_irq_or_nmi(struct pt_regs *regs) { if (IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && !interrupts_enabled(regs)) arm64_enter_nmi(regs); @@ -109,7 +111,7 @@ asmlinkage void noinstr enter_el1_irq_or_nmi(struct pt_regs *regs) enter_from_kernel_mode(regs); } -asmlinkage void noinstr exit_el1_irq_or_nmi(struct pt_regs *regs) +static void noinstr exit_el1_irq_or_nmi(struct pt_regs *regs) { if (IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && !interrupts_enabled(regs)) arm64_exit_nmi(regs); @@ -117,7 +119,7 @@ asmlinkage void noinstr exit_el1_irq_or_nmi(struct pt_regs *regs) exit_to_kernel_mode(regs); } -asmlinkage void __sched arm64_preempt_schedule_irq(void) +static void __sched arm64_preempt_schedule_irq(void) { lockdep_assert_irqs_disabled(); @@ -142,6 +144,18 @@ asmlinkage void __sched 
arm64_preempt_schedule_irq(void) preempt_schedule_irq(); } +static void do_interrupt_handler(struct pt_regs *regs, + void (*handler)(struct pt_regs *)) +{ + if (on_thread_stack()) + call_on_irq_stack(regs, handler); + else + handler(regs); +} + +extern void (*handle_arch_irq)(struct pt_regs *); +extern void (*handle_arch_fiq)(struct pt_regs *); + #ifdef CONFIG_ARM64_ERRATUM_1463225 static DEFINE_PER_CPU(int, __in_cortex_a76_erratum_1463225_wa); @@ -308,6 +322,36 @@ asmlinkage void noinstr el1_sync_handler(struct pt_regs *regs) } } +static void noinstr el1_interrupt(struct pt_regs *regs, + void (*handler)(struct pt_regs *)) +{ + write_sysreg(DAIF_PROCCTX_NOIRQ, daif); + + enter_el1_irq_or_nmi(regs); + do_interrupt_handler(regs, handler); + + /* + * Note: thread_info::preempt_count includes both thread_info::count + * and thread_info::need_resched, and is not equivalent to + * preempt_count(). + */ + if (IS_ENABLED(CONFIG_PREEMPTION) && + READ_ONCE(current_thread_info()->preempt_count) == 0) + arm64_preempt_schedule_irq(); + + exit_el1_irq_or_nmi(regs); +} + +asmlinkage void noinstr el1_irq_handler(struct pt_regs *regs) +{ + el1_interrupt(regs, handle_arch_irq); +} + +asmlinkage void noinstr el1_fiq_handler(struct pt_regs *regs) +{ + el1_interrupt(regs, handle_arch_fiq); +} + asmlinkage void noinstr el1_error_handler(struct pt_regs *regs) { unsigned long esr = read_sysreg(esr_el1); @@ -507,6 +551,39 @@ asmlinkage void noinstr el0_sync_handler(struct pt_regs *regs) } } +static void noinstr el0_interrupt(struct pt_regs *regs, + void (*handler)(struct pt_regs *)) +{ + enter_from_user_mode(); + + write_sysreg(DAIF_PROCCTX_NOIRQ, daif); + + if (regs->pc & BIT(55)) + arm64_apply_bp_hardening(); + + do_interrupt_handler(regs, handler); +} + +static void noinstr __el0_irq_handler_common(struct pt_regs *regs) +{ + el0_interrupt(regs, handle_arch_irq); +} + +asmlinkage void noinstr el0_irq_handler(struct pt_regs *regs) +{ + __el0_irq_handler_common(regs); +} + +static void noinstr __el0_fiq_handler_common(struct pt_regs *regs) +{ + el0_interrupt(regs, handle_arch_fiq); +} + +asmlinkage void noinstr el0_fiq_handler(struct pt_regs *regs) +{ + __el0_fiq_handler_common(regs); +} + static void __el0_error_handler_common(struct pt_regs *regs) { unsigned long esr = read_sysreg(esr_el1); @@ -583,6 +660,16 @@ asmlinkage void noinstr el0_sync_compat_handler(struct pt_regs *regs) } } +asmlinkage void noinstr el0_irq_compat_handler(struct pt_regs *regs) +{ + __el0_irq_handler_common(regs); +} + +asmlinkage void noinstr el0_fiq_compat_handler(struct pt_regs *regs) +{ + __el0_fiq_handler_common(regs); +} + asmlinkage void noinstr el0_error_compat_handler(struct pt_regs *regs) { __el0_error_handler_common(regs); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 8ca74ce115ee..8eb3a0a51413 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -33,12 +33,6 @@ * Context tracking and irqflag tracing need to instrument transitions between * user and kernel mode. 
*/ - .macro user_exit_irqoff -#if defined(CONFIG_CONTEXT_TRACKING) || defined(CONFIG_TRACE_IRQFLAGS) - bl enter_from_user_mode -#endif - .endm - .macro user_enter_irqoff #if defined(CONFIG_CONTEXT_TRACKING) || defined(CONFIG_TRACE_IRQFLAGS) bl exit_to_user_mode @@ -486,63 +480,12 @@ SYM_CODE_START_LOCAL(__swpan_exit_el0) SYM_CODE_END(__swpan_exit_el0) #endif - .macro irq_stack_entry - mov x19, sp // preserve the original sp -#ifdef CONFIG_SHADOW_CALL_STACK - mov x24, scs_sp // preserve the original shadow stack -#endif - - /* - * Compare sp with the base of the task stack. - * If the top ~(THREAD_SIZE - 1) bits match, we are on a task stack, - * and should switch to the irq stack. - */ - ldr x25, [tsk, TSK_STACK] - eor x25, x25, x19 - and x25, x25, #~(THREAD_SIZE - 1) - cbnz x25, 9998f - - ldr_this_cpu x25, irq_stack_ptr, x26 - mov x26, #IRQ_STACK_SIZE - add x26, x25, x26 - - /* switch to the irq stack */ - mov sp, x26 - -#ifdef CONFIG_SHADOW_CALL_STACK - /* also switch to the irq shadow stack */ - ldr_this_cpu scs_sp, irq_shadow_call_stack_ptr, x26 -#endif - -9998: - .endm - - /* - * The callee-saved regs (x19-x29) should be preserved between - * irq_stack_entry and irq_stack_exit, but note that kernel_entry - * uses x20-x23 to store data for later use. - */ - .macro irq_stack_exit - mov sp, x19 -#ifdef CONFIG_SHADOW_CALL_STACK - mov scs_sp, x24 -#endif - .endm - /* GPRs used by entry code */ tsk .req x28 // current thread_info /* * Interrupt handling. */ - .macro irq_handler, handler:req - ldr_l x1, \handler - mov x0, sp - irq_stack_entry - blr x1 - irq_stack_exit - .endm - .macro gic_prio_kentry_setup, tmp:req #ifdef CONFIG_ARM64_PSEUDO_NMI alternative_if ARM64_HAS_IRQ_PRIO_MASKING @@ -552,35 +495,6 @@ tsk .req x28 // current thread_info #endif .endm - .macro el1_interrupt_handler, handler:req - enable_da - - mov x0, sp - bl enter_el1_irq_or_nmi - - irq_handler \handler - -#ifdef CONFIG_PREEMPTION - ldr x24, [tsk, #TSK_TI_PREEMPT] // get preempt count - cbnz x24, 1f // preempt count != 0 - bl arm64_preempt_schedule_irq // irq en/disable is done inside -1: -#endif - - mov x0, sp - bl exit_el1_irq_or_nmi - .endm - - .macro el0_interrupt_handler, handler:req - user_exit_irqoff - enable_da - - tbz x22, #55, 1f - bl do_el0_irq_bp_hardening -1: - irq_handler \handler - .endm - .text /* @@ -704,13 +618,15 @@ SYM_CODE_END(el1_sync) .align 6 SYM_CODE_START_LOCAL_NOALIGN(el1_irq) kernel_entry 1 - el1_interrupt_handler handle_arch_irq + mov x0, sp + bl el1_irq_handler kernel_exit 1 SYM_CODE_END(el1_irq) SYM_CODE_START_LOCAL_NOALIGN(el1_fiq) kernel_entry 1 - el1_interrupt_handler handle_arch_fiq + mov x0, sp + bl el1_fiq_handler kernel_exit 1 SYM_CODE_END(el1_fiq) @@ -737,12 +653,16 @@ SYM_CODE_END(el0_sync_compat) .align 6 SYM_CODE_START_LOCAL_NOALIGN(el0_irq_compat) kernel_entry 0, 32 - b el0_irq_naked + mov x0, sp + bl el0_irq_compat_handler + b ret_to_user SYM_CODE_END(el0_irq_compat) SYM_CODE_START_LOCAL_NOALIGN(el0_fiq_compat) kernel_entry 0, 32 - b el0_fiq_naked + mov x0, sp + bl el0_fiq_compat_handler + b ret_to_user SYM_CODE_END(el0_fiq_compat) SYM_CODE_START_LOCAL_NOALIGN(el0_error_compat) @@ -756,15 +676,15 @@ SYM_CODE_END(el0_error_compat) .align 6 SYM_CODE_START_LOCAL_NOALIGN(el0_irq) kernel_entry 0 -el0_irq_naked: - el0_interrupt_handler handle_arch_irq + mov x0, sp + bl el0_irq_handler b ret_to_user SYM_CODE_END(el0_irq) SYM_CODE_START_LOCAL_NOALIGN(el0_fiq) kernel_entry 0 -el0_fiq_naked: - el0_interrupt_handler handle_arch_fiq + mov x0, sp + bl el0_fiq_handler b ret_to_user 
SYM_CODE_END(el0_fiq) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 871c82ab0a30..3b4a4adfddfd 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -836,13 +836,6 @@ void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs) } NOKPROBE_SYMBOL(do_mem_abort); -void do_el0_irq_bp_hardening(void) -{ - /* PC has already been checked in entry.S */ - arm64_apply_bp_hardening(); -} -NOKPROBE_SYMBOL(do_el0_irq_bp_hardening); - void do_sp_pc_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs) { arm64_notify_die("SP/PC alignment exception", regs, SIGBUS, BUS_ADRALN, -- cgit v1.2.3 From 2f2bbaa4eda027d0bf0f3f23d0c206b2b76e2180 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Jun 2021 10:46:12 +0100 Subject: arm64: entry: organise entry handlers consistently In entry.S we have two comments which distinguish EL0 and EL1 exception handlers, but the code isn't actually laid out to match, and there are a few other inconsistencies that would be good to clear up. This patch organizes the entry handers consistently: * The handlers are laid out in order of the vectors, to make them easier to navigate. * The inconsistently-applied alignment is removed * The handlers are consistently marked with SYM_CODE_START_LOCAL() rather than SYM_CODE_START_LOCAL_NOALIGN(), giving them the same default alignment as other assembly code snippets. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Acked-by: Catalin Marinas Acked-by: Marc Zyngier Reviewed-by: Joey Gouly Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20210607094624.34689-9-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/entry.S | 78 ++++++++++++++++++++++------------------------- 1 file changed, 36 insertions(+), 42 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 8eb3a0a51413..ed7c55d57afe 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -607,65 +607,88 @@ SYM_CODE_END(el1_error_invalid) /* * EL1 mode handlers. */ - .align 6 -SYM_CODE_START_LOCAL_NOALIGN(el1_sync) +SYM_CODE_START_LOCAL(el1_sync) kernel_entry 1 mov x0, sp bl el1_sync_handler kernel_exit 1 SYM_CODE_END(el1_sync) - .align 6 -SYM_CODE_START_LOCAL_NOALIGN(el1_irq) +SYM_CODE_START_LOCAL(el1_irq) kernel_entry 1 mov x0, sp bl el1_irq_handler kernel_exit 1 SYM_CODE_END(el1_irq) -SYM_CODE_START_LOCAL_NOALIGN(el1_fiq) +SYM_CODE_START_LOCAL(el1_fiq) kernel_entry 1 mov x0, sp bl el1_fiq_handler kernel_exit 1 SYM_CODE_END(el1_fiq) +SYM_CODE_START_LOCAL(el1_error) + kernel_entry 1 + mov x0, sp + bl el1_error_handler + kernel_exit 1 +SYM_CODE_END(el1_error) + /* * EL0 mode handlers. 
*/ - .align 6 -SYM_CODE_START_LOCAL_NOALIGN(el0_sync) +SYM_CODE_START_LOCAL(el0_sync) kernel_entry 0 mov x0, sp bl el0_sync_handler b ret_to_user SYM_CODE_END(el0_sync) +SYM_CODE_START_LOCAL(el0_irq) + kernel_entry 0 + mov x0, sp + bl el0_irq_handler + b ret_to_user +SYM_CODE_END(el0_irq) + +SYM_CODE_START_LOCAL(el0_fiq) + kernel_entry 0 + mov x0, sp + bl el0_fiq_handler + b ret_to_user +SYM_CODE_END(el0_fiq) + +SYM_CODE_START_LOCAL(el0_error) + kernel_entry 0 + mov x0, sp + bl el0_error_handler + b ret_to_user +SYM_CODE_END(el0_error) + #ifdef CONFIG_COMPAT - .align 6 -SYM_CODE_START_LOCAL_NOALIGN(el0_sync_compat) +SYM_CODE_START_LOCAL(el0_sync_compat) kernel_entry 0, 32 mov x0, sp bl el0_sync_compat_handler b ret_to_user SYM_CODE_END(el0_sync_compat) - .align 6 -SYM_CODE_START_LOCAL_NOALIGN(el0_irq_compat) +SYM_CODE_START_LOCAL(el0_irq_compat) kernel_entry 0, 32 mov x0, sp bl el0_irq_compat_handler b ret_to_user SYM_CODE_END(el0_irq_compat) -SYM_CODE_START_LOCAL_NOALIGN(el0_fiq_compat) +SYM_CODE_START_LOCAL(el0_fiq_compat) kernel_entry 0, 32 mov x0, sp bl el0_fiq_compat_handler b ret_to_user SYM_CODE_END(el0_fiq_compat) -SYM_CODE_START_LOCAL_NOALIGN(el0_error_compat) +SYM_CODE_START_LOCAL(el0_error_compat) kernel_entry 0, 32 mov x0, sp bl el0_error_compat_handler @@ -673,35 +696,6 @@ SYM_CODE_START_LOCAL_NOALIGN(el0_error_compat) SYM_CODE_END(el0_error_compat) #endif - .align 6 -SYM_CODE_START_LOCAL_NOALIGN(el0_irq) - kernel_entry 0 - mov x0, sp - bl el0_irq_handler - b ret_to_user -SYM_CODE_END(el0_irq) - -SYM_CODE_START_LOCAL_NOALIGN(el0_fiq) - kernel_entry 0 - mov x0, sp - bl el0_fiq_handler - b ret_to_user -SYM_CODE_END(el0_fiq) - -SYM_CODE_START_LOCAL(el1_error) - kernel_entry 1 - mov x0, sp - bl el1_error_handler - kernel_exit 1 -SYM_CODE_END(el1_error) - -SYM_CODE_START_LOCAL(el0_error) - kernel_entry 0 - mov x0, sp - bl el0_error_handler - b ret_to_user -SYM_CODE_END(el0_error) - /* * "slow" syscall return path. */ -- cgit v1.2.3 From e931fa03c6bf525babc9a41b951eb2311b055abb Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Jun 2021 10:46:13 +0100 Subject: arm64: entry: organise entry vectors consistently In subsequent patches we'll rename the entry handlers based on their original EL, register width, and exception class. To do so, we need to make all 3 mandatory arguments to the `kernel_ventry` macro, and distinguish EL1h from EL1t. In preparation for this, let's make the current set of arguments mandatory, and move the `regsize` column before the branch label suffix, making the vectors easier to read column-wise. There should be no functional change as a result of this patch. 
Signed-off-by: Mark Rutland Acked-by: Marc Zyngier Acked-by: Catalin Marinas Reviewed-by: Joey Gouly Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20210607094624.34689-10-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/entry.S | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index ed7c55d57afe..e29d0fb77358 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -54,7 +54,7 @@ #define BAD_FIQ 2 #define BAD_ERROR 3 - .macro kernel_ventry, el, label, regsize = 64 + .macro kernel_ventry, el:req, regsize:req, label:req .align 7 #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 .if \el == 0 @@ -504,31 +504,31 @@ tsk .req x28 // current thread_info .align 11 SYM_CODE_START(vectors) - kernel_ventry 1, sync_invalid // Synchronous EL1t - kernel_ventry 1, irq_invalid // IRQ EL1t - kernel_ventry 1, fiq_invalid // FIQ EL1t - kernel_ventry 1, error_invalid // Error EL1t + kernel_ventry 1, 64, sync_invalid // Synchronous EL1t + kernel_ventry 1, 64, irq_invalid // IRQ EL1t + kernel_ventry 1, 64, fiq_invalid // FIQ EL1t + kernel_ventry 1, 64, error_invalid // Error EL1t - kernel_ventry 1, sync // Synchronous EL1h - kernel_ventry 1, irq // IRQ EL1h - kernel_ventry 1, fiq // FIQ EL1h - kernel_ventry 1, error // Error EL1h + kernel_ventry 1, 64, sync // Synchronous EL1h + kernel_ventry 1, 64, irq // IRQ EL1h + kernel_ventry 1, 64, fiq // FIQ EL1h + kernel_ventry 1, 64, error // Error EL1h - kernel_ventry 0, sync // Synchronous 64-bit EL0 - kernel_ventry 0, irq // IRQ 64-bit EL0 - kernel_ventry 0, fiq // FIQ 64-bit EL0 - kernel_ventry 0, error // Error 64-bit EL0 + kernel_ventry 0, 64, sync // Synchronous 64-bit EL0 + kernel_ventry 0, 64, irq // IRQ 64-bit EL0 + kernel_ventry 0, 64, fiq // FIQ 64-bit EL0 + kernel_ventry 0, 64, error // Error 64-bit EL0 #ifdef CONFIG_COMPAT - kernel_ventry 0, sync_compat, 32 // Synchronous 32-bit EL0 - kernel_ventry 0, irq_compat, 32 // IRQ 32-bit EL0 - kernel_ventry 0, fiq_compat, 32 // FIQ 32-bit EL0 - kernel_ventry 0, error_compat, 32 // Error 32-bit EL0 + kernel_ventry 0, 32, sync_compat // Synchronous 32-bit EL0 + kernel_ventry 0, 32, irq_compat // IRQ 32-bit EL0 + kernel_ventry 0, 32, fiq_compat // FIQ 32-bit EL0 + kernel_ventry 0, 32, error_compat // Error 32-bit EL0 #else - kernel_ventry 0, sync_invalid, 32 // Synchronous 32-bit EL0 - kernel_ventry 0, irq_invalid, 32 // IRQ 32-bit EL0 - kernel_ventry 0, fiq_invalid, 32 // FIQ 32-bit EL0 - kernel_ventry 0, error_invalid, 32 // Error 32-bit EL0 + kernel_ventry 0, 32, sync_invalid // Synchronous 32-bit EL0 + kernel_ventry 0, 32, irq_invalid // IRQ 32-bit EL0 + kernel_ventry 0, 32, fiq_invalid // FIQ 32-bit EL0 + kernel_ventry 0, 32, error_invalid // Error 32-bit EL0 #endif SYM_CODE_END(vectors) -- cgit v1.2.3 From af541cbbf9c646d2eaa8b3ee3836d5b16435e848 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Jun 2021 10:46:14 +0100 Subject: arm64: entry: consolidate EL1 exception returns Following the example of ret_to_user, let's consolidate all the EL1 return paths with a ret_to_kernel helper, rather than each entry point having its own copy of the return code. There should be no functional change as a result of this patch. 
Signed-off-by: Mark Rutland Acked-by: Catalin Marinas Acked-by: Marc Zyngier Reviewed-by: Joey Gouly Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20210607094624.34689-11-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/entry.S | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index e29d0fb77358..54986d488983 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -611,30 +611,34 @@ SYM_CODE_START_LOCAL(el1_sync) kernel_entry 1 mov x0, sp bl el1_sync_handler - kernel_exit 1 + b ret_to_kernel SYM_CODE_END(el1_sync) SYM_CODE_START_LOCAL(el1_irq) kernel_entry 1 mov x0, sp bl el1_irq_handler - kernel_exit 1 + b ret_to_kernel SYM_CODE_END(el1_irq) SYM_CODE_START_LOCAL(el1_fiq) kernel_entry 1 mov x0, sp bl el1_fiq_handler - kernel_exit 1 + b ret_to_kernel SYM_CODE_END(el1_fiq) SYM_CODE_START_LOCAL(el1_error) kernel_entry 1 mov x0, sp bl el1_error_handler - kernel_exit 1 + b ret_to_kernel SYM_CODE_END(el1_error) +SYM_CODE_START_LOCAL(ret_to_kernel) + kernel_exit 1 +SYM_CODE_END(ret_to_kernel) + /* * EL0 mode handlers. */ -- cgit v1.2.3 From cbed5f8d3feb5ecc84c998b81db7e004b3fb2135 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Jun 2021 10:46:15 +0100 Subject: arm64: entry: move bad_mode() to entry-common.c In subsequent patches we'll rework the way bad_mode() is called by exception entry code. In preparation for this, let's move bad_mode() itself into entry-common.c. Let's also mark it as noinstr (e.g. to prevent it being kprobed), and let's also make the `handler` array a local variable, as this is only use by bad_mode(), and will be removed entirely in a subsequent patch. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Acked-by: Catalin Marinas Acked-by: Marc Zyngier Reviewed-by: Joey Gouly Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20210607094624.34689-12-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/entry-common.c | 27 +++++++++++++++++++++++++++ arch/arm64/kernel/traps.c | 25 ------------------------- 2 files changed, 27 insertions(+), 25 deletions(-) diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index ae1b6d7c00e1..74d09fd3dafa 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -22,6 +22,7 @@ #include #include #include +#include /* * This is intended to match the logic in irqentry_enter(), handling the kernel @@ -156,6 +157,32 @@ static void do_interrupt_handler(struct pt_regs *regs, extern void (*handle_arch_irq)(struct pt_regs *); extern void (*handle_arch_fiq)(struct pt_regs *); +/* + * bad_mode handles the impossible case in the exception vector. This is always + * fatal. 
+ */ +asmlinkage void noinstr bad_mode(struct pt_regs *regs, int reason, unsigned int esr) +{ + const char *handler[] = { + "Synchronous Abort", + "IRQ", + "FIQ", + "Error" + }; + + arm64_enter_nmi(regs); + + console_verbose(); + + pr_crit("Bad mode in %s handler detected on CPU%d, code 0x%08x -- %s\n", + handler[reason], smp_processor_id(), esr, + esr_get_class_string(esr)); + + __show_regs(regs); + panic("bad mode"); +} + + #ifdef CONFIG_ARM64_ERRATUM_1463225 static DEFINE_PER_CPU(int, __in_cortex_a76_erratum_1463225_wa); diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 5fd12d19ef4b..7def18ff02e2 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -45,13 +45,6 @@ #include #include -static const char *handler[] = { - "Synchronous Abort", - "IRQ", - "FIQ", - "Error" -}; - int show_unhandled_signals = 0; static void dump_kernel_instr(const char *lvl, struct pt_regs *regs) @@ -750,24 +743,6 @@ const char *esr_get_class_string(u32 esr) return esr_class_str[ESR_ELx_EC(esr)]; } -/* - * bad_mode handles the impossible case in the exception vector. This is always - * fatal. - */ -asmlinkage void notrace bad_mode(struct pt_regs *regs, int reason, unsigned int esr) -{ - arm64_enter_nmi(regs); - - console_verbose(); - - pr_crit("Bad mode in %s handler detected on CPU%d, code 0x%08x -- %s\n", - handler[reason], smp_processor_id(), esr, - esr_get_class_string(esr)); - - __show_regs(regs); - panic("bad mode"); -} - /* * bad_el0_sync handles unexpected, but potentially recoverable synchronous * exceptions taken from EL0. Unlike bad_mode, this returns. -- cgit v1.2.3 From ca0c2647f54c34000b4026c6632268d2dc304c67 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Jun 2021 10:46:16 +0100 Subject: arm64: entry: improve bad_mode() Our use of bad_mode() has a few rough edges: * AArch64 doesn't use the term "mode", and refers to "Execution states", "Exception levels", and "Selected stack pointer". * We log the exception type (SYNC/IRQ/FIQ/SError), but not the actual "mode" (though this can be decoded from the SPSR value). * We use bad_mode() as a second-level handler for unexpected synchronous exceptions, where the "mode" is legitimate, but the specific exception is not. * We dump the ESR value, but call this "code", and so it's not clear to all readers that this is the ESR. ... and all of this can be somewhat opaque to those who aren't extremely familiar with the code. Let's make this a bit clearer by having bad_mode() log "Unhandled ${TYPE} exception" rather than "Bad mode in ${TYPE} handler", using "ESR" rather than "code", and having the final panic() log "Unhandled exception" rather than "Bad mode". In future we'd like to log the specific architectural vector rather than just the type of exception, so we also split the core of bad_mode() out into a helper called __panic_unhandled(), which takes the vector as a string argument. 
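For reference, this is how the reworked reporting path reads when the added lines of the diff below are assembled in one place; bad_mode() becomes a thin wrapper that maps its reason code to a vector string and defers to __panic_unhandled():

  static void noinstr __panic_unhandled(struct pt_regs *regs, const char *vector,
                                        unsigned int esr)
  {
          arm64_enter_nmi(regs);

          console_verbose();

          pr_crit("Unhandled %s exception on CPU%d, ESR 0x%08x -- %s\n",
                  vector, smp_processor_id(), esr, esr_get_class_string(esr));

          __show_regs(regs);
          panic("Unhandled exception");
  }

  asmlinkage void noinstr bad_mode(struct pt_regs *regs, int reason, unsigned int esr)
  {
          const char *handler[] = { "Synchronous Abort", "IRQ", "FIQ", "Error" };

          __panic_unhandled(regs, handler[reason], esr);
  }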
Signed-off-by: Mark Rutland Acked-by: Catalin Marinas Acked-by: Marc Zyngier Reviewed-by: Joey Gouly Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20210607094624.34689-13-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/entry-common.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 74d09fd3dafa..d0f9a6394067 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -157,31 +157,32 @@ static void do_interrupt_handler(struct pt_regs *regs, extern void (*handle_arch_irq)(struct pt_regs *); extern void (*handle_arch_fiq)(struct pt_regs *); -/* - * bad_mode handles the impossible case in the exception vector. This is always - * fatal. - */ -asmlinkage void noinstr bad_mode(struct pt_regs *regs, int reason, unsigned int esr) +static void noinstr __panic_unhandled(struct pt_regs *regs, const char *vector, + unsigned int esr) { - const char *handler[] = { - "Synchronous Abort", - "IRQ", - "FIQ", - "Error" - }; - arm64_enter_nmi(regs); console_verbose(); - pr_crit("Bad mode in %s handler detected on CPU%d, code 0x%08x -- %s\n", - handler[reason], smp_processor_id(), esr, + pr_crit("Unhandled %s exception on CPU%d, ESR 0x%08x -- %s\n", + vector, smp_processor_id(), esr, esr_get_class_string(esr)); __show_regs(regs); - panic("bad mode"); + panic("Unhandled exception"); } +asmlinkage void noinstr bad_mode(struct pt_regs *regs, int reason, unsigned int esr) +{ + const char *handler[] = { + "Synchronous Abort", + "IRQ", + "FIQ", + "Error" + }; + + __panic_unhandled(regs, handler[reason], esr); +} #ifdef CONFIG_ARM64_ERRATUM_1463225 static DEFINE_PER_CPU(int, __in_cortex_a76_erratum_1463225_wa); -- cgit v1.2.3 From a5b43a87a7609d49ed4a453a2b99b6d36ab1e5d0 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Jun 2021 10:46:17 +0100 Subject: arm64: entry: template the entry asm functions Now that the majority of the exception triage logic has been converted to C, the entry assembly functions all have a uniform structure. Let's generate them all with an assembly macro to reduce the amount of code and to ensure they all remain in sync if we make changes in future. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Acked-by: Catalin Marinas Acked-by: Marc Zyngier Reviewed-by: Joey Gouly Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20210607094624.34689-14-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/entry.S | 113 +++++++++++----------------------------------- 1 file changed, 27 insertions(+), 86 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 54986d488983..b719ac26f7d1 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -604,102 +604,43 @@ SYM_CODE_START_LOCAL(el1_error_invalid) inv_entry 1, BAD_ERROR SYM_CODE_END(el1_error_invalid) -/* - * EL1 mode handlers. 
- */ -SYM_CODE_START_LOCAL(el1_sync) - kernel_entry 1 - mov x0, sp - bl el1_sync_handler - b ret_to_kernel -SYM_CODE_END(el1_sync) - -SYM_CODE_START_LOCAL(el1_irq) - kernel_entry 1 - mov x0, sp - bl el1_irq_handler - b ret_to_kernel -SYM_CODE_END(el1_irq) - -SYM_CODE_START_LOCAL(el1_fiq) - kernel_entry 1 - mov x0, sp - bl el1_fiq_handler - b ret_to_kernel -SYM_CODE_END(el1_fiq) - -SYM_CODE_START_LOCAL(el1_error) - kernel_entry 1 + .macro entry_handler el:req, regsize:req, label:req +SYM_CODE_START_LOCAL(el\el\()_\label) + kernel_entry \el, \regsize mov x0, sp - bl el1_error_handler + bl el\el\()_\label\()_handler + .if \el == 0 + b ret_to_user + .else b ret_to_kernel -SYM_CODE_END(el1_error) - -SYM_CODE_START_LOCAL(ret_to_kernel) - kernel_exit 1 -SYM_CODE_END(ret_to_kernel) + .endif +SYM_CODE_END(el\el\()_\label) + .endm /* - * EL0 mode handlers. + * Early exception handlers */ -SYM_CODE_START_LOCAL(el0_sync) - kernel_entry 0 - mov x0, sp - bl el0_sync_handler - b ret_to_user -SYM_CODE_END(el0_sync) + entry_handler 1, 64, sync + entry_handler 1, 64, irq + entry_handler 1, 64, fiq + entry_handler 1, 64, error -SYM_CODE_START_LOCAL(el0_irq) - kernel_entry 0 - mov x0, sp - bl el0_irq_handler - b ret_to_user -SYM_CODE_END(el0_irq) - -SYM_CODE_START_LOCAL(el0_fiq) - kernel_entry 0 - mov x0, sp - bl el0_fiq_handler - b ret_to_user -SYM_CODE_END(el0_fiq) - -SYM_CODE_START_LOCAL(el0_error) - kernel_entry 0 - mov x0, sp - bl el0_error_handler - b ret_to_user -SYM_CODE_END(el0_error) + entry_handler 0, 64, sync + entry_handler 0, 64, irq + entry_handler 0, 64, fiq + entry_handler 0, 64, error #ifdef CONFIG_COMPAT -SYM_CODE_START_LOCAL(el0_sync_compat) - kernel_entry 0, 32 - mov x0, sp - bl el0_sync_compat_handler - b ret_to_user -SYM_CODE_END(el0_sync_compat) - -SYM_CODE_START_LOCAL(el0_irq_compat) - kernel_entry 0, 32 - mov x0, sp - bl el0_irq_compat_handler - b ret_to_user -SYM_CODE_END(el0_irq_compat) - -SYM_CODE_START_LOCAL(el0_fiq_compat) - kernel_entry 0, 32 - mov x0, sp - bl el0_fiq_compat_handler - b ret_to_user -SYM_CODE_END(el0_fiq_compat) - -SYM_CODE_START_LOCAL(el0_error_compat) - kernel_entry 0, 32 - mov x0, sp - bl el0_error_compat_handler - b ret_to_user -SYM_CODE_END(el0_error_compat) + entry_handler 0, 32, sync_compat + entry_handler 0, 32, irq_compat + entry_handler 0, 32, fiq_compat + entry_handler 0, 32, error_compat #endif +SYM_CODE_START_LOCAL(ret_to_kernel) + kernel_exit 1 +SYM_CODE_END(ret_to_kernel) + /* * "slow" syscall return path. */ -- cgit v1.2.3 From ec841aab8d3cdd23decdcf0c47292e14627446c1 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Jun 2021 10:46:18 +0100 Subject: arm64: entry: handle all vectors with C We have 16 architectural exception vectors, and depending on kernel configuration we handle 8 or 12 of these with C code, with the remaining 8 or 4 of these handled as special cases in the entry assembly. It would be nicer if the entry assembly were uniform for all exceptions, and we deferred any specific handling of the exceptions to C code. This way the entry assembly can be more easily templated without ifdeffery or special cases, and it's easier to modify the handling of these cases in future (e.g. to dump additional registers other context). This patch reworks the entry code so that we always have a C handler for every architectural exception vector, with the entry assembly being completely uniform. We now have to handle exceptions from EL1t and EL1h, and also have to handle exceptions from AArch32 even when the kernel is built without CONFIG_COMPAT. 
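As a concrete illustration of the naming scheme spelled out in the next paragraph (an el1t/el1h/el0t prefix, then the register width, then the exception class, with _handler appended on the C side), here are a few representative prototypes taken from the header diff in this patch:

  asmlinkage void el1t_64_sync_handler(struct pt_regs *regs);   /* asm entry: el1t_64_sync  */
  asmlinkage void el1h_64_irq_handler(struct pt_regs *regs);    /* asm entry: el1h_64_irq   */
  asmlinkage void el0t_64_fiq_handler(struct pt_regs *regs);    /* asm entry: el0t_64_fiq   */
  asmlinkage void el0t_32_error_handler(struct pt_regs *regs);  /* asm entry: el0t_32_error */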
To make this clear and to simplify templating, we rename the top-level exception handlers with a consistent naming scheme: asm: __ c: ___handler .. where: is `el1t`, `el1h`, or `el0t` is `64` or `32` is `sync`, `irq`, `fiq`, or `error` ... e.g. asm: el1h_64_sync c: el1h_64_sync_handler ... with lower-level handlers simply using "el1" and "compat" as today. For unexpected exceptions, this information is passed to __panic_unhandled(), so it can report the specific vector an unexpected exception was taken from, e.g. | Unhandled 64-bit el1t sync exception For vectors we never expect to enter legitimately, the C code is generated using a macro to avoid code duplication. The exceptions are handled via __panic_unhandled(), replacing bad_mode() (which is removed). The `kernel_ventry` and `entry_handler` assembly macros are updated to handle the new naming scheme. In theory it should be possible to generate the entry functions at the same time as the vectors using a single table, but this will require reworking the linker script to split the two into separate sections, so for now we have separate tables. Signed-off-by: Mark Rutland Acked-by: Catalin Marinas Acked-by: Marc Zyngier Reviewed-by: Joey Gouly Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20210607094624.34689-15-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/exception.h | 32 ++++---- arch/arm64/kernel/entry-common.c | 51 +++++++------ arch/arm64/kernel/entry.S | 146 ++++++++++++------------------------- arch/arm64/kernel/traps.c | 2 +- 4 files changed, 93 insertions(+), 138 deletions(-) diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index 4284ee57a9a5..ad30a5a1d2bf 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -31,18 +31,25 @@ static inline u32 disr_to_esr(u64 disr) return esr; } -asmlinkage void el1_sync_handler(struct pt_regs *regs); -asmlinkage void el1_irq_handler(struct pt_regs *regs); -asmlinkage void el1_fiq_handler(struct pt_regs *regs); -asmlinkage void el1_error_handler(struct pt_regs *regs); -asmlinkage void el0_sync_handler(struct pt_regs *regs); -asmlinkage void el0_irq_handler(struct pt_regs *regs); -asmlinkage void el0_fiq_handler(struct pt_regs *regs); -asmlinkage void el0_error_handler(struct pt_regs *regs); -asmlinkage void el0_sync_compat_handler(struct pt_regs *regs); -asmlinkage void el0_irq_compat_handler(struct pt_regs *regs); -asmlinkage void el0_fiq_compat_handler(struct pt_regs *regs); -asmlinkage void el0_error_compat_handler(struct pt_regs *regs); +asmlinkage void el1t_64_sync_handler(struct pt_regs *regs); +asmlinkage void el1t_64_irq_handler(struct pt_regs *regs); +asmlinkage void el1t_64_fiq_handler(struct pt_regs *regs); +asmlinkage void el1t_64_error_handler(struct pt_regs *regs); + +asmlinkage void el1h_64_sync_handler(struct pt_regs *regs); +asmlinkage void el1h_64_irq_handler(struct pt_regs *regs); +asmlinkage void el1h_64_fiq_handler(struct pt_regs *regs); +asmlinkage void el1h_64_error_handler(struct pt_regs *regs); + +asmlinkage void el0t_64_sync_handler(struct pt_regs *regs); +asmlinkage void el0t_64_irq_handler(struct pt_regs *regs); +asmlinkage void el0t_64_fiq_handler(struct pt_regs *regs); +asmlinkage void el0t_64_error_handler(struct pt_regs *regs); + +asmlinkage void el0t_32_sync_handler(struct pt_regs *regs); +asmlinkage void el0t_32_irq_handler(struct pt_regs *regs); +asmlinkage void el0t_32_fiq_handler(struct pt_regs *regs); +asmlinkage void 
el0t_32_error_handler(struct pt_regs *regs); asmlinkage void call_on_irq_stack(struct pt_regs *regs, void (*func)(struct pt_regs *)); @@ -53,7 +60,6 @@ void arm64_exit_nmi(struct pt_regs *regs); void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs); void do_undefinstr(struct pt_regs *regs); void do_bti(struct pt_regs *regs); -asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr); void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr, struct pt_regs *regs); void do_fpsimd_acc(unsigned int esr, struct pt_regs *regs); diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index d0f9a6394067..dd6403b748f2 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -172,16 +172,11 @@ static void noinstr __panic_unhandled(struct pt_regs *regs, const char *vector, panic("Unhandled exception"); } -asmlinkage void noinstr bad_mode(struct pt_regs *regs, int reason, unsigned int esr) -{ - const char *handler[] = { - "Synchronous Abort", - "IRQ", - "FIQ", - "Error" - }; - - __panic_unhandled(regs, handler[reason], esr); +#define UNHANDLED(el, regsize, vector) \ +asmlinkage void noinstr el##_##regsize##_##vector##_handler(struct pt_regs *regs) \ +{ \ + const char *desc = #regsize "-bit " #el " " #vector; \ + __panic_unhandled(regs, desc, read_sysreg(esr_el1)); \ } #ifdef CONFIG_ARM64_ERRATUM_1463225 @@ -233,6 +228,11 @@ static bool cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs) } #endif /* CONFIG_ARM64_ERRATUM_1463225 */ +UNHANDLED(el1t, 64, sync) +UNHANDLED(el1t, 64, irq) +UNHANDLED(el1t, 64, fiq) +UNHANDLED(el1t, 64, error) + static void noinstr el1_abort(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); @@ -268,7 +268,7 @@ static void noinstr el1_inv(struct pt_regs *regs, unsigned long esr) { enter_from_kernel_mode(regs); local_daif_inherit(regs); - bad_mode(regs, 0, esr); + __panic_unhandled(regs, "64-bit el1h sync", esr); local_daif_mask(); exit_to_kernel_mode(regs); } @@ -316,7 +316,7 @@ static void noinstr el1_fpac(struct pt_regs *regs, unsigned long esr) exit_to_kernel_mode(regs); } -asmlinkage void noinstr el1_sync_handler(struct pt_regs *regs) +asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs) { unsigned long esr = read_sysreg(esr_el1); @@ -370,17 +370,17 @@ static void noinstr el1_interrupt(struct pt_regs *regs, exit_el1_irq_or_nmi(regs); } -asmlinkage void noinstr el1_irq_handler(struct pt_regs *regs) +asmlinkage void noinstr el1h_64_irq_handler(struct pt_regs *regs) { el1_interrupt(regs, handle_arch_irq); } -asmlinkage void noinstr el1_fiq_handler(struct pt_regs *regs) +asmlinkage void noinstr el1h_64_fiq_handler(struct pt_regs *regs) { el1_interrupt(regs, handle_arch_fiq); } -asmlinkage void noinstr el1_error_handler(struct pt_regs *regs) +asmlinkage void noinstr el1h_64_error_handler(struct pt_regs *regs) { unsigned long esr = read_sysreg(esr_el1); @@ -526,7 +526,7 @@ static void noinstr el0_fpac(struct pt_regs *regs, unsigned long esr) do_ptrauth_fault(regs, esr); } -asmlinkage void noinstr el0_sync_handler(struct pt_regs *regs) +asmlinkage void noinstr el0t_64_sync_handler(struct pt_regs *regs) { unsigned long esr = read_sysreg(esr_el1); @@ -597,7 +597,7 @@ static void noinstr __el0_irq_handler_common(struct pt_regs *regs) el0_interrupt(regs, handle_arch_irq); } -asmlinkage void noinstr el0_irq_handler(struct pt_regs *regs) +asmlinkage void noinstr el0t_64_irq_handler(struct pt_regs *regs) { 
__el0_irq_handler_common(regs); } @@ -607,7 +607,7 @@ static void noinstr __el0_fiq_handler_common(struct pt_regs *regs) el0_interrupt(regs, handle_arch_fiq); } -asmlinkage void noinstr el0_fiq_handler(struct pt_regs *regs) +asmlinkage void noinstr el0t_64_fiq_handler(struct pt_regs *regs) { __el0_fiq_handler_common(regs); } @@ -624,7 +624,7 @@ static void __el0_error_handler_common(struct pt_regs *regs) local_daif_restore(DAIF_PROCCTX); } -asmlinkage void noinstr el0_error_handler(struct pt_regs *regs) +asmlinkage void noinstr el0t_64_error_handler(struct pt_regs *regs) { __el0_error_handler_common(regs); } @@ -644,7 +644,7 @@ static void noinstr el0_svc_compat(struct pt_regs *regs) do_el0_svc_compat(regs); } -asmlinkage void noinstr el0_sync_compat_handler(struct pt_regs *regs) +asmlinkage void noinstr el0t_32_sync_handler(struct pt_regs *regs) { unsigned long esr = read_sysreg(esr_el1); @@ -688,18 +688,23 @@ asmlinkage void noinstr el0_sync_compat_handler(struct pt_regs *regs) } } -asmlinkage void noinstr el0_irq_compat_handler(struct pt_regs *regs) +asmlinkage void noinstr el0t_32_irq_handler(struct pt_regs *regs) { __el0_irq_handler_common(regs); } -asmlinkage void noinstr el0_fiq_compat_handler(struct pt_regs *regs) +asmlinkage void noinstr el0t_32_fiq_handler(struct pt_regs *regs) { __el0_fiq_handler_common(regs); } -asmlinkage void noinstr el0_error_compat_handler(struct pt_regs *regs) +asmlinkage void noinstr el0t_32_error_handler(struct pt_regs *regs) { __el0_error_handler_common(regs); } +#else /* CONFIG_COMPAT */ +UNHANDLED(el0t, 32, sync) +UNHANDLED(el0t, 32, irq) +UNHANDLED(el0t, 32, fiq) +UNHANDLED(el0t, 32, error) #endif /* CONFIG_COMPAT */ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index b719ac26f7d1..d43a12dfd189 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -45,16 +45,7 @@ .endr .endm -/* - * Bad Abort numbers - *----------------- - */ -#define BAD_SYNC 0 -#define BAD_IRQ 1 -#define BAD_FIQ 2 -#define BAD_ERROR 3 - - .macro kernel_ventry, el:req, regsize:req, label:req + .macro kernel_ventry, el:req, ht:req, regsize:req, label:req .align 7 #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 .if \el == 0 @@ -81,7 +72,7 @@ alternative_else_nop_endif tbnz x0, #THREAD_SHIFT, 0f sub x0, sp, x0 // x0'' = sp' - x0' = (sp + x0) - sp = x0 sub sp, sp, x0 // sp'' = sp' - x0 = (sp + x0) - x0 = sp - b el\()\el\()_\label + b el\el\ht\()_\regsize\()_\label 0: /* @@ -113,7 +104,7 @@ alternative_else_nop_endif sub sp, sp, x0 mrs x0, tpidrro_el0 #endif - b el\()\el\()_\label + b el\el\ht\()_\regsize\()_\label .endm .macro tramp_alias, dst, sym @@ -504,32 +495,25 @@ tsk .req x28 // current thread_info .align 11 SYM_CODE_START(vectors) - kernel_ventry 1, 64, sync_invalid // Synchronous EL1t - kernel_ventry 1, 64, irq_invalid // IRQ EL1t - kernel_ventry 1, 64, fiq_invalid // FIQ EL1t - kernel_ventry 1, 64, error_invalid // Error EL1t - - kernel_ventry 1, 64, sync // Synchronous EL1h - kernel_ventry 1, 64, irq // IRQ EL1h - kernel_ventry 1, 64, fiq // FIQ EL1h - kernel_ventry 1, 64, error // Error EL1h - - kernel_ventry 0, 64, sync // Synchronous 64-bit EL0 - kernel_ventry 0, 64, irq // IRQ 64-bit EL0 - kernel_ventry 0, 64, fiq // FIQ 64-bit EL0 - kernel_ventry 0, 64, error // Error 64-bit EL0 - -#ifdef CONFIG_COMPAT - kernel_ventry 0, 32, sync_compat // Synchronous 32-bit EL0 - kernel_ventry 0, 32, irq_compat // IRQ 32-bit EL0 - kernel_ventry 0, 32, fiq_compat // FIQ 32-bit EL0 - kernel_ventry 0, 32, error_compat // Error 32-bit EL0 -#else - kernel_ventry 0, 
32, sync_invalid // Synchronous 32-bit EL0 - kernel_ventry 0, 32, irq_invalid // IRQ 32-bit EL0 - kernel_ventry 0, 32, fiq_invalid // FIQ 32-bit EL0 - kernel_ventry 0, 32, error_invalid // Error 32-bit EL0 -#endif + kernel_ventry 1, t, 64, sync // Synchronous EL1t + kernel_ventry 1, t, 64, irq // IRQ EL1t + kernel_ventry 1, t, 64, fiq // FIQ EL1h + kernel_ventry 1, t, 64, error // Error EL1t + + kernel_ventry 1, h, 64, sync // Synchronous EL1h + kernel_ventry 1, h, 64, irq // IRQ EL1h + kernel_ventry 1, h, 64, fiq // FIQ EL1h + kernel_ventry 1, h, 64, error // Error EL1h + + kernel_ventry 0, t, 64, sync // Synchronous 64-bit EL0 + kernel_ventry 0, t, 64, irq // IRQ 64-bit EL0 + kernel_ventry 0, t, 64, fiq // FIQ 64-bit EL0 + kernel_ventry 0, t, 64, error // Error 64-bit EL0 + + kernel_ventry 0, t, 32, sync // Synchronous 32-bit EL0 + kernel_ventry 0, t, 32, irq // IRQ 32-bit EL0 + kernel_ventry 0, t, 32, fiq // FIQ 32-bit EL0 + kernel_ventry 0, t, 32, error // Error 32-bit EL0 SYM_CODE_END(vectors) #ifdef CONFIG_VMAP_STACK @@ -560,82 +544,42 @@ __bad_stack: ASM_BUG() #endif /* CONFIG_VMAP_STACK */ -/* - * Invalid mode handlers - */ - .macro inv_entry, el, reason, regsize = 64 - kernel_entry \el, \regsize - mov x0, sp - mov x1, #\reason - mrs x2, esr_el1 - bl bad_mode - ASM_BUG() - .endm - -SYM_CODE_START_LOCAL(el0_sync_invalid) - inv_entry 0, BAD_SYNC -SYM_CODE_END(el0_sync_invalid) - -SYM_CODE_START_LOCAL(el0_irq_invalid) - inv_entry 0, BAD_IRQ -SYM_CODE_END(el0_irq_invalid) - -SYM_CODE_START_LOCAL(el0_fiq_invalid) - inv_entry 0, BAD_FIQ -SYM_CODE_END(el0_fiq_invalid) - -SYM_CODE_START_LOCAL(el0_error_invalid) - inv_entry 0, BAD_ERROR -SYM_CODE_END(el0_error_invalid) -SYM_CODE_START_LOCAL(el1_sync_invalid) - inv_entry 1, BAD_SYNC -SYM_CODE_END(el1_sync_invalid) - -SYM_CODE_START_LOCAL(el1_irq_invalid) - inv_entry 1, BAD_IRQ -SYM_CODE_END(el1_irq_invalid) - -SYM_CODE_START_LOCAL(el1_fiq_invalid) - inv_entry 1, BAD_FIQ -SYM_CODE_END(el1_fiq_invalid) - -SYM_CODE_START_LOCAL(el1_error_invalid) - inv_entry 1, BAD_ERROR -SYM_CODE_END(el1_error_invalid) - - .macro entry_handler el:req, regsize:req, label:req -SYM_CODE_START_LOCAL(el\el\()_\label) + .macro entry_handler el:req, ht:req, regsize:req, label:req +SYM_CODE_START_LOCAL(el\el\ht\()_\regsize\()_\label) kernel_entry \el, \regsize mov x0, sp - bl el\el\()_\label\()_handler + bl el\el\ht\()_\regsize\()_\label\()_handler .if \el == 0 b ret_to_user .else b ret_to_kernel .endif -SYM_CODE_END(el\el\()_\label) +SYM_CODE_END(el\el\ht\()_\regsize\()_\label) .endm /* * Early exception handlers */ - entry_handler 1, 64, sync - entry_handler 1, 64, irq - entry_handler 1, 64, fiq - entry_handler 1, 64, error - - entry_handler 0, 64, sync - entry_handler 0, 64, irq - entry_handler 0, 64, fiq - entry_handler 0, 64, error - -#ifdef CONFIG_COMPAT - entry_handler 0, 32, sync_compat - entry_handler 0, 32, irq_compat - entry_handler 0, 32, fiq_compat - entry_handler 0, 32, error_compat -#endif + entry_handler 1, t, 64, sync + entry_handler 1, t, 64, irq + entry_handler 1, t, 64, fiq + entry_handler 1, t, 64, error + + entry_handler 1, h, 64, sync + entry_handler 1, h, 64, irq + entry_handler 1, h, 64, fiq + entry_handler 1, h, 64, error + + entry_handler 0, t, 64, sync + entry_handler 0, t, 64, irq + entry_handler 0, t, 64, fiq + entry_handler 0, t, 64, error + + entry_handler 0, t, 32, sync + entry_handler 0, t, 32, irq + entry_handler 0, t, 32, fiq + entry_handler 0, t, 32, error SYM_CODE_START_LOCAL(ret_to_kernel) kernel_exit 1 diff --git 
a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 7def18ff02e2..47d423f7ac81 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -745,7 +745,7 @@ const char *esr_get_class_string(u32 esr) /* * bad_el0_sync handles unexpected, but potentially recoverable synchronous - * exceptions taken from EL0. Unlike bad_mode, this returns. + * exceptions taken from EL0. */ void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr) { -- cgit v1.2.3 From afd05e28c9115d01f01d934962634789d069d3fe Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Jun 2021 10:46:19 +0100 Subject: arm64: entry: fold el1_inv() into el1h_64_sync_handler() An unexpected synchronous exception from EL1h could happen at any time, and for robustness we should treat this as an NMI, making minimal assumptions about the context the exception was taken from. Currently el1_inv() assumes we can use enter_from_kernel_mode(), and also assumes that we should inherit the original DAIF value. Neither of these are desireable when we take an unexpected exception. Further, after el1_inv() calls __panic_unhandled(), the remainder of the function is unreachable, and therefore superfluous. Let's address this and simplify things by having el1h_64_sync_handler() call __panic_unhandled() directly, without any of the redundant logic. Signed-off-by: Mark Rutland Acked-by: Catalin Marinas Acked-by: Marc Zyngier Reported-by: Joey Gouly Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20210607094624.34689-16-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/entry-common.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index dd6403b748f2..ce5c8af91d31 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -264,15 +264,6 @@ static void noinstr el1_undef(struct pt_regs *regs) exit_to_kernel_mode(regs); } -static void noinstr el1_inv(struct pt_regs *regs, unsigned long esr) -{ - enter_from_kernel_mode(regs); - local_daif_inherit(regs); - __panic_unhandled(regs, "64-bit el1h sync", esr); - local_daif_mask(); - exit_to_kernel_mode(regs); -} - static void noinstr arm64_enter_el1_dbg(struct pt_regs *regs) { regs->lockdep_hardirqs = lockdep_hardirqs_enabled(); @@ -346,7 +337,7 @@ asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs) el1_fpac(regs, esr); break; default: - el1_inv(regs, esr); + __panic_unhandled(regs, "64-bit el1h sync", esr); } } -- cgit v1.2.3 From 8168f098867f6584295ea408c683f61e945c6ff1 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Jun 2021 10:46:20 +0100 Subject: arm64: entry: split bad stack entry We'd like to keep all the entry sequencing in entry-common.c, as this will allow us to ensure this is consistent, and free from any unsound instrumentation. Currently handle_bad_stack() performs the NMI entry sequence in traps.c. Let's split the low-level entry sequence from the reporting, moving the former to entry-common.c and keeping the latter in traps.c. To make it clear that reporting function never returns, it is renamed to panic_bad_stack(). 
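A condensed view of the resulting split, assembled from the diff below: the low-level entry sequence now lives in entry-common.c, and only the renamed reporting function remains in traps.c:

  /* arch/arm64/kernel/entry-common.c */
  #ifdef CONFIG_VMAP_STACK
  asmlinkage void noinstr handle_bad_stack(struct pt_regs *regs)
  {
          unsigned int esr = read_sysreg(esr_el1);
          unsigned long far = read_sysreg(far_el1);

          arm64_enter_nmi(regs);
          panic_bad_stack(regs, esr, far);        /* reporting stays in traps.c */
  }
  #endif /* CONFIG_VMAP_STACK */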
Signed-off-by: Mark Rutland Acked-by: Catalin Marinas Acked-by: Marc Zyngier Reviewed-by: Joey Gouly Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20210607094624.34689-17-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/exception.h | 4 ++++ arch/arm64/kernel/entry-common.c | 11 +++++++++++ arch/arm64/kernel/traps.c | 6 +----- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index ad30a5a1d2bf..0113b9242b67 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -31,6 +31,8 @@ static inline u32 disr_to_esr(u64 disr) return esr; } +asmlinkage void handle_bad_stack(struct pt_regs *regs); + asmlinkage void el1t_64_sync_handler(struct pt_regs *regs); asmlinkage void el1t_64_irq_handler(struct pt_regs *regs); asmlinkage void el1t_64_fiq_handler(struct pt_regs *regs); @@ -73,4 +75,6 @@ void do_el0_svc(struct pt_regs *regs); void do_el0_svc_compat(struct pt_regs *regs); void do_ptrauth_fault(struct pt_regs *regs, unsigned int esr); void do_serror(struct pt_regs *regs, unsigned int esr); + +void panic_bad_stack(struct pt_regs *regs, unsigned int esr, unsigned long far); #endif /* __ASM_EXCEPTION_H */ diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index ce5c8af91d31..efe95edf10c0 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -699,3 +699,14 @@ UNHANDLED(el0t, 32, irq) UNHANDLED(el0t, 32, fiq) UNHANDLED(el0t, 32, error) #endif /* CONFIG_COMPAT */ + +#ifdef CONFIG_VMAP_STACK +asmlinkage void noinstr handle_bad_stack(struct pt_regs *regs) +{ + unsigned int esr = read_sysreg(esr_el1); + unsigned long far = read_sysreg(far_el1); + + arm64_enter_nmi(regs); + panic_bad_stack(regs, esr, far); +} +#endif /* CONFIG_VMAP_STACK */ diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 47d423f7ac81..af941996eb5f 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -763,15 +763,11 @@ void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr) DEFINE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack) __aligned(16); -asmlinkage void noinstr handle_bad_stack(struct pt_regs *regs) +void panic_bad_stack(struct pt_regs *regs, unsigned int esr, unsigned long far) { unsigned long tsk_stk = (unsigned long)current->stack; unsigned long irq_stk = (unsigned long)this_cpu_read(irq_stack_ptr); unsigned long ovf_stk = (unsigned long)this_cpu_ptr(overflow_stack); - unsigned int esr = read_sysreg(esr_el1); - unsigned long far = read_sysreg(far_el1); - - arm64_enter_nmi(regs); console_verbose(); pr_emerg("Insufficient stack space to handle exception!"); -- cgit v1.2.3 From d60b228fd19985a903b8e8c599be0538a875d505 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Jun 2021 10:46:21 +0100 Subject: arm64: entry: split SDEI entry We'd like to keep all the entry sequencing in entry-common.c, as this will allow us to ensure this is consistent, and free from any unsound instrumentation. Currently __sdei_handler() performs the NMI entry/exit sequences in sdei.c. Let's split the low-level entry sequence from the event handling, moving the former to entry-common.c and keeping the latter in sdei.c. The event handling function is renamed to do_sdei_event(), matching the do_${FOO}() pattern used for other exception handlers. 
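A condensed sketch of the new shape, assembled from the diff below (the long comment explaining the PSTATE/PAN rules is elided here): the NMI entry/exit bracketing and the PAN reset move to entry-common.c, while the event handling stays in sdei.c as do_sdei_event():

  /* arch/arm64/kernel/entry-common.c */
  #ifdef CONFIG_ARM_SDE_INTERFACE
  asmlinkage noinstr unsigned long
  __sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg)
  {
          unsigned long ret;

          /* we didn't take an exception to get here, so reset PAN explicitly */
          if (system_uses_hw_pan())
                  set_pstate_pan(1);
          else if (cpu_has_pan())
                  set_pstate_pan(0);

          arm64_enter_nmi(regs);
          ret = do_sdei_event(regs, arg);         /* event handling stays in sdei.c */
          arm64_exit_nmi(regs);

          return ret;
  }
  #endif /* CONFIG_ARM_SDE_INTERFACE */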
Signed-off-by: Mark Rutland Acked-by: Catalin Marinas Acked-by: Marc Zyngier Reviewed-by: Joey Gouly Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20210607094624.34689-18-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/sdei.h | 3 +++ arch/arm64/kernel/entry-common.c | 37 +++++++++++++++++++++++++++++++ arch/arm64/kernel/sdei.c | 48 +++------------------------------------- 3 files changed, 43 insertions(+), 45 deletions(-) diff --git a/arch/arm64/include/asm/sdei.h b/arch/arm64/include/asm/sdei.h index 63e0b92a5fbb..03d619a49d4a 100644 --- a/arch/arm64/include/asm/sdei.h +++ b/arch/arm64/include/asm/sdei.h @@ -37,6 +37,9 @@ struct sdei_registered_event; asmlinkage unsigned long __sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg); +unsigned long do_sdei_event(struct pt_regs *regs, + struct sdei_registered_event *arg); + unsigned long sdei_arch_get_entry_point(int conduit); #define sdei_arch_get_entry_point(x) sdei_arch_get_entry_point(x) diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index efe95edf10c0..1b32ca3848f5 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -710,3 +711,39 @@ asmlinkage void noinstr handle_bad_stack(struct pt_regs *regs) panic_bad_stack(regs, esr, far); } #endif /* CONFIG_VMAP_STACK */ + +#ifdef CONFIG_ARM_SDE_INTERFACE +asmlinkage noinstr unsigned long +__sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg) +{ + unsigned long ret; + + /* + * We didn't take an exception to get here, so the HW hasn't + * set/cleared bits in PSTATE that we may rely on. + * + * The original SDEI spec (ARM DEN 0054A) can be read ambiguously as to + * whether PSTATE bits are inherited unchanged or generated from + * scratch, and the TF-A implementation always clears PAN and always + * clears UAO. There are no other known implementations. + * + * Subsequent revisions (ARM DEN 0054B) follow the usual rules for how + * PSTATE is modified upon architectural exceptions, and so PAN is + * either inherited or set per SCTLR_ELx.SPAN, and UAO is always + * cleared. + * + * We must explicitly reset PAN to the expected state, including + * clearing it when the host isn't using it, in case a VM had it set. + */ + if (system_uses_hw_pan()) + set_pstate_pan(1); + else if (cpu_has_pan()) + set_pstate_pan(0); + + arm64_enter_nmi(regs); + ret = do_sdei_event(regs, arg); + arm64_exit_nmi(regs); + + return ret; +} +#endif /* CONFIG_ARM_SDE_INTERFACE */ diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c index 2c7ca449dd51..e72953992743 100644 --- a/arch/arm64/kernel/sdei.c +++ b/arch/arm64/kernel/sdei.c @@ -231,13 +231,13 @@ out_err: } /* - * __sdei_handler() returns one of: + * do_sdei_event() returns one of: * SDEI_EV_HANDLED - success, return to the interrupted context. * SDEI_EV_FAILED - failure, return this error code to firmare. * virtual-address - success, return to this address. 
*/ -static __kprobes unsigned long _sdei_handler(struct pt_regs *regs, - struct sdei_registered_event *arg) +unsigned long __kprobes do_sdei_event(struct pt_regs *regs, + struct sdei_registered_event *arg) { u32 mode; int i, err = 0; @@ -292,45 +292,3 @@ static __kprobes unsigned long _sdei_handler(struct pt_regs *regs, return vbar + 0x480; } - -static void __kprobes notrace __sdei_pstate_entry(void) -{ - /* - * The original SDEI spec (ARM DEN 0054A) can be read ambiguously as to - * whether PSTATE bits are inherited unchanged or generated from - * scratch, and the TF-A implementation always clears PAN and always - * clears UAO. There are no other known implementations. - * - * Subsequent revisions (ARM DEN 0054B) follow the usual rules for how - * PSTATE is modified upon architectural exceptions, and so PAN is - * either inherited or set per SCTLR_ELx.SPAN, and UAO is always - * cleared. - * - * We must explicitly reset PAN to the expected state, including - * clearing it when the host isn't using it, in case a VM had it set. - */ - if (system_uses_hw_pan()) - set_pstate_pan(1); - else if (cpu_has_pan()) - set_pstate_pan(0); -} - -asmlinkage noinstr unsigned long -__sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg) -{ - unsigned long ret; - - /* - * We didn't take an exception to get here, so the HW hasn't - * set/cleared bits in PSTATE that we may rely on. Initialize PAN. - */ - __sdei_pstate_entry(); - - arm64_enter_nmi(regs); - - ret = _sdei_handler(regs, arg); - - arm64_exit_nmi(regs); - - return ret; -} -- cgit v1.2.3 From 6ecbc78c3d06a3e7a4676f348a52f1c533d88464 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Jun 2021 10:46:22 +0100 Subject: arm64: entry: make NMI entry/exit functions static Now that we only call arm64_enter_nmi() and arm64_exit_nmi() from within entry-common.c, let's make these static to ensure this remains the case. 
Signed-off-by: Mark Rutland Acked-by: Catalin Marinas Acked-by: Marc Zyngier Reviewed-by: Joey Gouly Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20210607094624.34689-19-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/exception.h | 2 -- arch/arm64/kernel/entry-common.c | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index 0113b9242b67..4afbc45b8bb0 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -57,8 +57,6 @@ asmlinkage void call_on_irq_stack(struct pt_regs *regs, void (*func)(struct pt_regs *)); asmlinkage void enter_from_user_mode(void); asmlinkage void exit_to_user_mode(void); -void arm64_enter_nmi(struct pt_regs *regs); -void arm64_exit_nmi(struct pt_regs *regs); void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs); void do_undefinstr(struct pt_regs *regs); void do_bti(struct pt_regs *regs); diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 1b32ca3848f5..12ce14a98b7c 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -75,7 +75,7 @@ static void noinstr exit_to_kernel_mode(struct pt_regs *regs) } } -void noinstr arm64_enter_nmi(struct pt_regs *regs) +static void noinstr arm64_enter_nmi(struct pt_regs *regs) { regs->lockdep_hardirqs = lockdep_hardirqs_enabled(); @@ -88,7 +88,7 @@ void noinstr arm64_enter_nmi(struct pt_regs *regs) ftrace_nmi_enter(); } -void noinstr arm64_exit_nmi(struct pt_regs *regs) +static void noinstr arm64_exit_nmi(struct pt_regs *regs) { bool restore = regs->lockdep_hardirqs; -- cgit v1.2.3 From bf6fa2c0dda751863c3446aa64d733013bec4a19 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Jun 2021 10:46:23 +0100 Subject: arm64: entry: don't instrument entry code with KCOV The code in entry-common.c runs at exception entry and return boundaries, where portions of the kernel environment aren't available. For example, RCU may not be watching, and lockdep state may be out-of-sync with the hardware. Due to this, it is not sound to instrument this code. We generally avoid instrumentation by marking the entry functions as `noinstr`, but currently this doesn't inhibit KCOV instrumentation. Prevent this by disabling KCOV for the entire compilation unit. Signed-off-by: Mark Rutland Acked-by: Catalin Marinas Acked-by: Marc Zyngier Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20210607094624.34689-20-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/Makefile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 6cc97730790e..787c3c83edd7 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -14,6 +14,11 @@ CFLAGS_REMOVE_return_address.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_syscall.o = -fstack-protector -fstack-protector-strong CFLAGS_syscall.o += -fno-stack-protector +# It's not safe to invoke KCOV when portions of the kernel environment aren't +# available or are out-of-sync with HW state. Since `noinstr` doesn't always +# inhibit KCOV instrumentation, disable it for the entire compilation unit. +KCOV_INSTRUMENT_entry.o := n + # Object file lists. 
obj-y := debug-monitors.o entry.o irq.o fpsimd.o \ entry-common.o entry-fpsimd.o process.o ptrace.o \ -- cgit v1.2.3 From b5df5b8307b1db6d168ffac29eff3974779bb34b Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Jun 2021 10:46:24 +0100 Subject: arm64: idle: don't instrument idle code with KCOV The low-level idle code in arch_cpu_idle() and its callees runs at a time where where portions of the kernel environment aren't available. For example, RCU may not be watching, and lockdep state may be out-of-sync with the hardware. Due to this, it is not sound to instrument this code. We generally avoid instrumentation by marking the entry functions as `noinstr`, but currently this doesn't inhibit KCOV instrumentation. Prevent this by factoring these functions into a new idle.c so that we can disable KCOV for the entire compilation unit, as is done for the core idle code in kernel/sched/idle.c. We'd like to keep instrumentation of the rest of process.c, and for the existing code in cpuidle.c, so a new compilation unit is preferable. The arch_cpu_idle_dead() function in process.c is a cpu hotplug function that is safe to instrument, so it is left as-is in process.c. Signed-off-by: Mark Rutland Acked-by: Catalin Marinas Acked-by: Marc Zyngier Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20210607094624.34689-21-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/Makefile | 3 +- arch/arm64/kernel/idle.c | 69 +++++++++++++++++++++++++++++++++++++++++++++ arch/arm64/kernel/process.c | 57 ------------------------------------- 3 files changed, 71 insertions(+), 58 deletions(-) create mode 100644 arch/arm64/kernel/idle.c diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 787c3c83edd7..de434204d723 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -18,6 +18,7 @@ CFLAGS_syscall.o += -fno-stack-protector # available or are out-of-sync with HW state. Since `noinstr` doesn't always # inhibit KCOV instrumentation, disable it for the entire compilation unit. KCOV_INSTRUMENT_entry.o := n +KCOV_INSTRUMENT_idle.o := n # Object file lists. obj-y := debug-monitors.o entry.o irq.o fpsimd.o \ @@ -27,7 +28,7 @@ obj-y := debug-monitors.o entry.o irq.o fpsimd.o \ return_address.o cpuinfo.o cpu_errata.o \ cpufeature.o alternative.o cacheinfo.o \ smp.o smp_spin_table.o topology.o smccc-call.o \ - syscall.o proton-pack.o idreg-override.o + syscall.o proton-pack.o idreg-override.o idle.o targets += efi-entry.o diff --git a/arch/arm64/kernel/idle.c b/arch/arm64/kernel/idle.c new file mode 100644 index 000000000000..45c79204dc40 --- /dev/null +++ b/arch/arm64/kernel/idle.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Low-level idle sequences + */ + +#include +#include + +#include +#include +#include +#include + +static void noinstr __cpu_do_idle(void) +{ + dsb(sy); + wfi(); +} + +static void noinstr __cpu_do_idle_irqprio(void) +{ + unsigned long pmr; + unsigned long daif_bits; + + daif_bits = read_sysreg(daif); + write_sysreg(daif_bits | PSR_I_BIT | PSR_F_BIT, daif); + + /* + * Unmask PMR before going idle to make sure interrupts can + * be raised. + */ + pmr = gic_read_pmr(); + gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); + + __cpu_do_idle(); + + gic_write_pmr(pmr); + write_sysreg(daif_bits, daif); +} + +/* + * cpu_do_idle() + * + * Idle the processor (wait for interrupt). 
+ * + * If the CPU supports priority masking we must do additional work to + * ensure that interrupts are not masked at the PMR (because the core will + * not wake up if we block the wake up signal in the interrupt controller). + */ +void noinstr cpu_do_idle(void) +{ + if (system_uses_irq_prio_masking()) + __cpu_do_idle_irqprio(); + else + __cpu_do_idle(); +} + +/* + * This is our default idle handler. + */ +void noinstr arch_cpu_idle(void) +{ + /* + * This should do all the clock switching and wait for interrupt + * tricks + */ + cpu_do_idle(); + raw_local_irq_enable(); +} diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 2e7337709155..72c5d80f03fa 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -73,63 +73,6 @@ EXPORT_SYMBOL_GPL(pm_power_off); void (*arm_pm_restart)(enum reboot_mode reboot_mode, const char *cmd); -static void noinstr __cpu_do_idle(void) -{ - dsb(sy); - wfi(); -} - -static void noinstr __cpu_do_idle_irqprio(void) -{ - unsigned long pmr; - unsigned long daif_bits; - - daif_bits = read_sysreg(daif); - write_sysreg(daif_bits | PSR_I_BIT | PSR_F_BIT, daif); - - /* - * Unmask PMR before going idle to make sure interrupts can - * be raised. - */ - pmr = gic_read_pmr(); - gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); - - __cpu_do_idle(); - - gic_write_pmr(pmr); - write_sysreg(daif_bits, daif); -} - -/* - * cpu_do_idle() - * - * Idle the processor (wait for interrupt). - * - * If the CPU supports priority masking we must do additional work to - * ensure that interrupts are not masked at the PMR (because the core will - * not wake up if we block the wake up signal in the interrupt controller). - */ -void noinstr cpu_do_idle(void) -{ - if (system_uses_irq_prio_masking()) - __cpu_do_idle_irqprio(); - else - __cpu_do_idle(); -} - -/* - * This is our default idle handler. - */ -void noinstr arch_cpu_idle(void) -{ - /* - * This should do all the clock switching and wait for interrupt - * tricks - */ - cpu_do_idle(); - raw_local_irq_enable(); -} - #ifdef CONFIG_HOTPLUG_CPU void arch_cpu_idle_dead(void) { -- cgit v1.2.3 From 5ca54404e68de8560ca15e8d0e6b625fd05ceeaf Mon Sep 17 00:00:00 2001 From: ChenXiaoSong Date: Tue, 8 Jun 2021 16:48:16 +0800 Subject: perf: qcom: Remove redundant dev_err call in qcom_l3_cache_pmu_probe() There is a error message within devm_ioremap_resource already, so remove the dev_err call to avoid redundant error message. Reported-by: Hulk Robot Signed-off-by: ChenXiaoSong Link: https://lore.kernel.org/r/20210608084816.1046485-1-chenxiaosong2@huawei.com Signed-off-by: Will Deacon --- drivers/perf/qcom_l3_pmu.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/perf/qcom_l3_pmu.c b/drivers/perf/qcom_l3_pmu.c index 081273543c6b..c76f6f21d2a8 100644 --- a/drivers/perf/qcom_l3_pmu.c +++ b/drivers/perf/qcom_l3_pmu.c @@ -767,10 +767,8 @@ static int qcom_l3_cache_pmu_probe(struct platform_device *pdev) memrc = platform_get_resource(pdev, IORESOURCE_MEM, 0); l3pmu->regs = devm_ioremap_resource(&pdev->dev, memrc); - if (IS_ERR(l3pmu->regs)) { - dev_err(&pdev->dev, "Can't map PMU @%pa\n", &memrc->start); + if (IS_ERR(l3pmu->regs)) return PTR_ERR(l3pmu->regs); - } qcom_l3_cache__init(l3pmu); -- cgit v1.2.3 From 59d697a99daa4723b62f9b07f41191cca1e44f3f Mon Sep 17 00:00:00 2001 From: Rikard Falkeborn Date: Sun, 6 Jun 2021 00:15:14 +0200 Subject: perf/hisi: Constify static attribute_group structs These are only put in an array of pointers to const attribute_group structs. 
Make them const like the other static attribute_group structs to allow the compiler to put them in read-only memory. Signed-off-by: Rikard Falkeborn Link: https://lore.kernel.org/r/20210605221514.73449-1-rikard.falkeborn@gmail.com Signed-off-by: Will Deacon --- drivers/perf/hisilicon/hisi_uncore_pa_pmu.c | 2 +- drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c b/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c index e1f71eab5640..83264ec0a957 100644 --- a/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c @@ -333,7 +333,7 @@ static struct attribute *hisi_pa_pmu_identifier_attrs[] = { NULL }; -static struct attribute_group hisi_pa_pmu_identifier_group = { +static const struct attribute_group hisi_pa_pmu_identifier_group = { .attrs = hisi_pa_pmu_identifier_attrs, }; diff --git a/drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c b/drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c index 08e028d9a406..6aedc303ff56 100644 --- a/drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c @@ -366,7 +366,7 @@ static struct attribute *hisi_sllc_pmu_identifier_attrs[] = { NULL }; -static struct attribute_group hisi_sllc_pmu_identifier_group = { +static const struct attribute_group hisi_sllc_pmu_identifier_group = { .attrs = hisi_sllc_pmu_identifier_attrs, }; -- cgit v1.2.3 From 27f2a4db76e8d8a8b601fc1c6a7a17f88bd907ab Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Fri, 21 May 2021 18:26:24 -0700 Subject: Makefile: fix GDB warning with CONFIG_RELR GDB produces the following warning when loading a kernel built with CONFIG_RELR: BFD: /android0/linux-next/vmlinux: unknown type [0x13] section `.relr.dyn'. It can also prevent debugging symbols using such relocations. Peter suggests: [That flag] means that lld will use dynamic tags and section type numbers in the OS-specific range rather than the generic range. The kernel itself doesn't care about these numbers; it determines the location of the RELR section using symbols defined by a linker script.
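To make that last point concrete, the sketch below is an illustrative C rendering, not the kernel's actual implementation (which does this in assembly during early relocation), of how RELR contents bounded by hypothetical __relr_start/__relr_end linker-script symbols can be decoded. It follows the documented RELR scheme: an even entry is the link-time address of a word that needs the load offset applied, and an odd entry is a bitmap covering the next 63 words after the last address.

extern unsigned long __relr_start[], __relr_end[];	/* hypothetical linker-script symbols */

static void apply_relr(unsigned long offset)		/* offset = runtime minus link-time address */
{
	unsigned long *cursor = NULL;	/* the format guarantees an address entry comes first */
	unsigned long *p;

	for (p = __relr_start; p < __relr_end; p++) {
		unsigned long entry = *p;

		if (!(entry & 1)) {
			/* Even entry: link-time address of a word needing the offset added. */
			cursor = (unsigned long *)(entry + offset);
			*cursor++ += offset;
		} else {
			/* Odd entry: bit n (n >= 1) marks word n-1 after the cursor. */
			unsigned long *w = cursor;
			unsigned long bits;

			for (bits = entry >> 1; bits; bits >>= 1, w++)
				if (bits & 1)
					*w += offset;
			cursor += 63;	/* each bitmap entry covers 63 words */
		}
	}
}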
Link: https://github.com/ClangBuiltLinux/linux/issues/1057 Suggested-by: Peter Collingbourne Reviewed-by: Nathan Chancellor Signed-off-by: Nick Desaulniers Link: https://lore.kernel.org/r/20210522012626.2811297-1-ndesaulniers@google.com Signed-off-by: Will Deacon --- Makefile | 2 +- scripts/tools-support-relr.sh | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index e4468353425a..e38c74d0433c 100644 --- a/Makefile +++ b/Makefile @@ -1031,7 +1031,7 @@ LDFLAGS_vmlinux += $(call ld-option, -X,) endif ifeq ($(CONFIG_RELR),y) -LDFLAGS_vmlinux += --pack-dyn-relocs=relr +LDFLAGS_vmlinux += --pack-dyn-relocs=relr --use-android-relr-tags endif # We never want expected sections to be placed heuristically by the diff --git a/scripts/tools-support-relr.sh b/scripts/tools-support-relr.sh index 45e8aa360b45..cb55878bd5b8 100755 --- a/scripts/tools-support-relr.sh +++ b/scripts/tools-support-relr.sh @@ -7,7 +7,8 @@ trap "rm -f $tmp_file.o $tmp_file $tmp_file.bin" EXIT cat << "END" | $CC -c -x c - -o $tmp_file.o >/dev/null 2>&1 void *p = &p; END -$LD $tmp_file.o -shared -Bsymbolic --pack-dyn-relocs=relr -o $tmp_file +$LD $tmp_file.o -shared -Bsymbolic --pack-dyn-relocs=relr \ + --use-android-relr-tags -o $tmp_file # Despite printing an error message, GNU nm still exits with exit code 0 if it # sees a relr section. So we need to check that nothing is printed to stderr. -- cgit v1.2.3 From cfa7ff959a789a953eac40c8ac793e2cfc2db931 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 3 Jun 2021 19:41:18 +0100 Subject: arm64: smccc: Support SMCCC v1.3 SVE register saving hint SMCCC v1.2 requires that all SVE state be preserved over SMC calls which introduces substantial overhead in the common case where there is no SVE state in the registers. To avoid this SMCCC v1.3 introduces a flag which allows the caller to say that there is no state that needs to be preserved in the registers. Make use of this flag, setting it if the SMCCC version indicates support for it and the TIF_ flags indicate that there is no live SVE state in the registers, this avoids placing any constraints on when SMCCC calls can be done or triggering extra saving and reloading of SVE register state in the kernel. This would be straightforward enough except for the rather entertaining inline assembly we use to do SMCCC v1.1 calls to allow us to take advantage of the limited number of registers it clobbers. Deal with this by having a function which we call immediately before issuing the SMCCC call to make our checks and set the flag. Using alternatives the overhead if SVE is supported but not detected at runtime can be reduced to a single NOP. Signed-off-by: Mark Brown Reviewed-by: Ard Biesheuvel Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20210603184118.15090-1-broonie@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/smccc-call.S | 26 ++++++++++++++++++++++++++ drivers/firmware/smccc/smccc.c | 4 ++++ include/linux/arm-smccc.h | 33 +++++++++++++++++++++++++++++++-- 3 files changed, 61 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/smccc-call.S b/arch/arm64/kernel/smccc-call.S index 2def9d0dd3dd..d3d37f932b97 100644 --- a/arch/arm64/kernel/smccc-call.S +++ b/arch/arm64/kernel/smccc-call.S @@ -7,8 +7,34 @@ #include #include +#include + +/* + * If we have SMCCC v1.3 and (as is likely) no SVE state in + * the registers then set the SMCCC hint bit to say there's no + * need to preserve it. 
Do this by directly adjusting the SMCCC + * function value which is already stored in x0 ready to be called. + */ +SYM_FUNC_START(__arm_smccc_sve_check) + + ldr_l x16, smccc_has_sve_hint + cbz x16, 2f + + get_current_task x16 + ldr x16, [x16, #TSK_TI_FLAGS] + tbnz x16, #TIF_FOREIGN_FPSTATE, 1f // Any live FP state? + tbnz x16, #TIF_SVE, 2f // Does that state include SVE? + +1: orr x0, x0, ARM_SMCCC_1_3_SVE_HINT + +2: ret +SYM_FUNC_END(__arm_smccc_sve_check) +EXPORT_SYMBOL(__arm_smccc_sve_check) .macro SMCCC instr +alternative_if ARM64_SVE + bl __arm_smccc_sve_check +alternative_else_nop_endif \instr #0 ldr x4, [sp] stp x0, x1, [x4, #ARM_SMCCC_RES_X0_OFFS] diff --git a/drivers/firmware/smccc/smccc.c b/drivers/firmware/smccc/smccc.c index 028f81d702cc..9f937b125ab0 100644 --- a/drivers/firmware/smccc/smccc.c +++ b/drivers/firmware/smccc/smccc.c @@ -15,6 +15,7 @@ static u32 smccc_version = ARM_SMCCC_VERSION_1_0; static enum arm_smccc_conduit smccc_conduit = SMCCC_CONDUIT_NONE; bool __ro_after_init smccc_trng_available = false; +u64 __ro_after_init smccc_has_sve_hint = false; void __init arm_smccc_version_init(u32 version, enum arm_smccc_conduit conduit) { @@ -22,6 +23,9 @@ void __init arm_smccc_version_init(u32 version, enum arm_smccc_conduit conduit) smccc_conduit = conduit; smccc_trng_available = smccc_probe_trng(); + if (IS_ENABLED(CONFIG_ARM64_SVE) && + smccc_version >= ARM_SMCCC_VERSION_1_3) + smccc_has_sve_hint = true; } enum arm_smccc_conduit arm_smccc_1_1_get_conduit(void) diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 5cef2b8b0479..7d1cabe15262 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -63,6 +63,9 @@ #define ARM_SMCCC_VERSION_1_0 0x10000 #define ARM_SMCCC_VERSION_1_1 0x10001 #define ARM_SMCCC_VERSION_1_2 0x10002 +#define ARM_SMCCC_VERSION_1_3 0x10003 + +#define ARM_SMCCC_1_3_SVE_HINT 0x10000 #define ARM_SMCCC_VERSION_FUNC_ID \ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ @@ -216,6 +219,8 @@ u32 arm_smccc_get_version(void); void __init arm_smccc_version_init(u32 version, enum arm_smccc_conduit conduit); +extern u64 smccc_has_sve_hint; + /** * struct arm_smccc_res - Result from SMC/HVC call * @a0-a3 result values from registers 0 to 3 @@ -295,6 +300,15 @@ struct arm_smccc_quirk { } state; }; +/** + * __arm_smccc_sve_check() - Set the SVE hint bit when doing SMC calls + * + * Sets the SMCCC hint bit to indicate if there is live state in the SVE + * registers, this modifies x0 in place and should never be called from C + * code. + */ +asmlinkage unsigned long __arm_smccc_sve_check(unsigned long x0); + /** * __arm_smccc_smc() - make SMC calls * @a0-a7: arguments passed in registers 0 to 7 @@ -352,6 +366,20 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1, #endif +/* nVHE hypervisor doesn't have a current thread so needs separate checks */ +#if defined(CONFIG_ARM64_SVE) && !defined(__KVM_NVHE_HYPERVISOR__) + +#define SMCCC_SVE_CHECK ALTERNATIVE("nop \n", "bl __arm_smccc_sve_check \n", \ + ARM64_SVE) +#define smccc_sve_clobbers "x16", "x30", "cc", + +#else + +#define SMCCC_SVE_CHECK +#define smccc_sve_clobbers + +#endif + #define ___count_args(_0, _1, _2, _3, _4, _5, _6, _7, _8, x, ...) x #define __count_args(...) 
\ @@ -419,7 +447,7 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1, #define ___constraints(count) \ : __constraint_read_ ## count \ - : "memory" + : smccc_sve_clobbers "memory" #define __constraints(count) ___constraints(count) /* @@ -434,7 +462,8 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1, register unsigned long r2 asm("r2"); \ register unsigned long r3 asm("r3"); \ __declare_args(__count_args(__VA_ARGS__), __VA_ARGS__); \ - asm volatile(inst "\n" : \ + asm volatile(SMCCC_SVE_CHECK \ + inst "\n" : \ "=r" (r0), "=r" (r1), "=r" (r2), "=r" (r3) \ __constraints(__count_args(__VA_ARGS__))); \ if (___res) \ -- cgit v1.2.3 From e0e3903f83d5e41ab7e7737ebe41ef36f578dc0a Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 8 Jun 2021 13:37:42 +0100 Subject: arm64: mm: decode xFSC in mem_abort_decode() It would be helpful if mem_abort_decode() could decode the DFSC/IFSC, as this can make it easier to identify common bugs (e.g. accesses which trigger alignment faults) without having to manually decode the xFSC value. Decode the xFSC in mem_abort_decode(). Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Robin Murphy Cc: Will Deacon Link: https://lore.kernel.org/r/20210608123742.11921-1-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/mm/fault.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 5c855b2ab93b..6786cf152666 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -99,6 +99,8 @@ static void mem_abort_decode(unsigned int esr) pr_alert(" EA = %lu, S1PTW = %lu\n", (esr & ESR_ELx_EA) >> ESR_ELx_EA_SHIFT, (esr & ESR_ELx_S1PTW) >> ESR_ELx_S1PTW_SHIFT); + pr_alert(" FSC = 0x%02x: %s\n", (esr & ESR_ELx_FSC), + esr_to_fault_info(esr)->name); if (esr_is_data_abort(esr)) data_abort_decode(esr); -- cgit v1.2.3 From 4c1daba15c209b99d192f147fea3dade30f72ed2 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Tue, 8 Jun 2021 12:55:12 +0100 Subject: perf/smmuv3: Don't trample existing events with global filter With global filtering, we only allow an event to be scheduled if its filter settings exactly match those of any existing events, therefore it is pointless to reapply the filter in that case. Much worse, though, is that in doing that we trample the event type of counter 0 if it's already active, and never touch the appropriate PMEVTYPERn so the new event is likely not counting the right thing either. Don't do that. CC: stable@vger.kernel.org Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/32c80c0e46237f49ad8da0c9f8864e13c4a803aa.1623153312.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- drivers/perf/arm_smmuv3_pmu.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c index 7786ccc6d12f..c195a9adec32 100644 --- a/drivers/perf/arm_smmuv3_pmu.c +++ b/drivers/perf/arm_smmuv3_pmu.c @@ -277,7 +277,7 @@ static int smmu_pmu_apply_event_filter(struct smmu_pmu *smmu_pmu, struct perf_event *event, int idx) { u32 span, sid; - unsigned int num_ctrs = smmu_pmu->num_counters; + unsigned int cur_idx, num_ctrs = smmu_pmu->num_counters; bool filter_en = !!get_filter_enable(event); span = filter_en ? get_filter_span(event) : @@ -285,17 +285,19 @@ static int smmu_pmu_apply_event_filter(struct smmu_pmu *smmu_pmu, sid = filter_en ? 
get_filter_stream_id(event) : SMMU_PMCG_DEFAULT_FILTER_SID; - /* Support individual filter settings */ - if (!smmu_pmu->global_filter) { + cur_idx = find_first_bit(smmu_pmu->used_counters, num_ctrs); + /* + * Per-counter filtering, or scheduling the first globally-filtered + * event into an empty PMU so idx == 0 and it works out equivalent. + */ + if (!smmu_pmu->global_filter || cur_idx == num_ctrs) { smmu_pmu_set_event_filter(event, idx, span, sid); return 0; } - /* Requested settings same as current global settings*/ - idx = find_first_bit(smmu_pmu->used_counters, num_ctrs); - if (idx == num_ctrs || - smmu_pmu_check_global_filter(smmu_pmu->events[idx], event)) { - smmu_pmu_set_event_filter(event, 0, span, sid); + /* Otherwise, must match whatever's currently scheduled */ + if (smmu_pmu_check_global_filter(smmu_pmu->events[cur_idx], event)) { + smmu_pmu_set_evtyper(smmu_pmu, idx, get_event(event)); return 0; } -- cgit v1.2.3 From f8e6d24144d1bfbb8714faa9044e135c0c00bd89 Mon Sep 17 00:00:00 2001 From: Qi Liu Date: Wed, 9 Jun 2021 14:40:57 +0800 Subject: perf: Add EVENT_ATTR_ID to simplify event attributes Similar EVENT_ATTR macros are defined in many PMU drivers, like Arm PMU driver, Arm SMMU PMU driver. So add a generic macro to simplify code. Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Signed-off-by: Qi Liu Link: https://lore.kernel.org/r/1623220863-58233-2-git-send-email-liuqi115@huawei.com Signed-off-by: Will Deacon --- include/linux/perf_event.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f5a6a2f069ed..2d510ad750ed 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1576,6 +1576,12 @@ static struct perf_pmu_events_attr _var = { \ .event_str = _str, \ }; +#define PMU_EVENT_ATTR_ID(_name, _show, _id) \ + (&((struct perf_pmu_events_attr[]) { \ + { .attr = __ATTR(_name, 0444, _show, NULL), \ + .id = _id, } \ + })[0].attr.attr) + #define PMU_FORMAT_ATTR(_name, _format) \ static ssize_t \ _name##_show(struct device *dev, \ -- cgit v1.2.3 From 7ac87a8dfbd9c42fa1920773b09a57586222aad4 Mon Sep 17 00:00:00 2001 From: Qi Liu Date: Wed, 9 Jun 2021 14:40:58 +0800 Subject: drivers/perf: Simplify EVENT ATTR macro in SMMU PMU driver Use common macro PMU_EVENT_ATTR_ID to simplify SMMU_EVENT_ATTR Cc: Will Deacon Cc: Mark Rutland Signed-off-by: Qi Liu Link: https://lore.kernel.org/r/1623220863-58233-3-git-send-email-liuqi115@huawei.com Signed-off-by: Will Deacon --- drivers/perf/arm_smmuv3_pmu.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c index c195a9adec32..226348822ab3 100644 --- a/drivers/perf/arm_smmuv3_pmu.c +++ b/drivers/perf/arm_smmuv3_pmu.c @@ -511,11 +511,8 @@ static ssize_t smmu_pmu_event_show(struct device *dev, return sysfs_emit(page, "event=0x%02llx\n", pmu_attr->id); } -#define SMMU_EVENT_ATTR(name, config) \ - (&((struct perf_pmu_events_attr) { \ - .attr = __ATTR(name, 0444, smmu_pmu_event_show, NULL), \ - .id = config, \ - }).attr.attr) +#define SMMU_EVENT_ATTR(name, config) \ + PMU_EVENT_ATTR_ID(name, smmu_pmu_event_show, config) static struct attribute *smmu_pmu_events[] = { SMMU_EVENT_ATTR(cycles, 0), -- cgit v1.2.3 From 0bf2d7298842afbc28a5413024ebc444a599e980 Mon Sep 17 00:00:00 2001 From: Qi Liu Date: Wed, 9 Jun 2021 14:40:59 +0800 Subject: drivers/perf: Simplify EVENT ATTR macro in qcom_l2_pmu.c Use common macro PMU_EVENT_ATTR_ID to 
simplify L2CACHE_EVENT_ATTR Cc: Andy Gross Cc: Will Deacon Cc: Mark Rutland Signed-off-by: Qi Liu Link: https://lore.kernel.org/r/1623220863-58233-4-git-send-email-liuqi115@huawei.com Signed-off-by: Will Deacon --- drivers/perf/qcom_l2_pmu.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/perf/qcom_l2_pmu.c b/drivers/perf/qcom_l2_pmu.c index b60e30141583..5b093badd0f6 100644 --- a/drivers/perf/qcom_l2_pmu.c +++ b/drivers/perf/qcom_l2_pmu.c @@ -679,11 +679,8 @@ static ssize_t l2cache_pmu_event_show(struct device *dev, return sysfs_emit(page, "event=0x%02llx\n", pmu_attr->id); } -#define L2CACHE_EVENT_ATTR(_name, _id) \ - (&((struct perf_pmu_events_attr[]) { \ - { .attr = __ATTR(_name, 0444, l2cache_pmu_event_show, NULL), \ - .id = _id, } \ - })[0].attr.attr) +#define L2CACHE_EVENT_ATTR(_name, _id) \ + PMU_EVENT_ATTR_ID(_name, l2cache_pmu_event_show, _id) static struct attribute *l2_cache_pmu_events[] = { L2CACHE_EVENT_ATTR(cycles, L2_EVENT_CYCLES), -- cgit v1.2.3 From 78b1d3c72070bbc9793e63dd6528c1e67ee0d52a Mon Sep 17 00:00:00 2001 From: Qi Liu Date: Wed, 9 Jun 2021 14:41:00 +0800 Subject: drivers/perf: Simplify EVENT ATTR macro in qcom_l3_pmu.c Use common macro PMU_EVENT_ATTR_ID to simplify L3CACHE_EVENT_ATTR Cc: Andy Gross Cc: Will Deacon Cc: Mark Rutland Signed-off-by: Qi Liu Link: https://lore.kernel.org/r/1623220863-58233-5-git-send-email-liuqi115@huawei.com Signed-off-by: Will Deacon --- drivers/perf/qcom_l3_pmu.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/perf/qcom_l3_pmu.c b/drivers/perf/qcom_l3_pmu.c index c76f6f21d2a8..1ff2ff6582bf 100644 --- a/drivers/perf/qcom_l3_pmu.c +++ b/drivers/perf/qcom_l3_pmu.c @@ -647,10 +647,7 @@ static ssize_t l3cache_pmu_event_show(struct device *dev, } #define L3CACHE_EVENT_ATTR(_name, _id) \ - (&((struct perf_pmu_events_attr[]) { \ - { .attr = __ATTR(_name, 0444, l3cache_pmu_event_show, NULL), \ - .id = _id, } \ - })[0].attr.attr) + PMU_EVENT_ATTR_ID(_name, l3cache_pmu_event_show, _id) static struct attribute *qcom_l3_cache_pmu_events[] = { L3CACHE_EVENT_ATTR(cycles, L3_EVENT_CYCLES), -- cgit v1.2.3 From b323dfe02e56627e4eaed7cf59dc609da67a1651 Mon Sep 17 00:00:00 2001 From: Qi Liu Date: Wed, 9 Jun 2021 14:41:01 +0800 Subject: drivers/perf: Simplify EVENT ATTR macro in xgene_pmu.c Use common macro PMU_EVENT_ATTR_ID to simplify XGENE_PMU_EVENT_ATTR Cc: Khuong Dinh Cc: Will Deacon Cc: Mark Rutland Signed-off-by: Qi Liu Link: https://lore.kernel.org/r/1623220863-58233-6-git-send-email-liuqi115@huawei.com Signed-off-by: Will Deacon --- drivers/perf/xgene_pmu.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/perf/xgene_pmu.c b/drivers/perf/xgene_pmu.c index 62d942534a6b..2b6d476bd213 100644 --- a/drivers/perf/xgene_pmu.c +++ b/drivers/perf/xgene_pmu.c @@ -278,17 +278,14 @@ static const struct attribute_group mc_pmu_v3_format_attr_group = { static ssize_t xgene_pmu_event_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct dev_ext_attribute *eattr; + struct perf_pmu_events_attr *pmu_attr = + container_of(attr, struct perf_pmu_events_attr, attr); - eattr = container_of(attr, struct dev_ext_attribute, attr); - return sysfs_emit(buf, "config=0x%lx\n", (unsigned long) eattr->var); + return sysfs_emit(buf, "config=0x%llx\n", pmu_attr->id); } #define XGENE_PMU_EVENT_ATTR(_name, _config) \ - (&((struct dev_ext_attribute[]) { \ - { .attr = __ATTR(_name, S_IRUGO, xgene_pmu_event_show, NULL), \ - .var = (void *) _config, } \ - 
})[0].attr.attr) + PMU_EVENT_ATTR_ID(_name, xgene_pmu_event_show, _config) static struct attribute *l3c_pmu_events_attrs[] = { XGENE_PMU_EVENT_ATTR(cycle-count, 0x00), -- cgit v1.2.3 From 773510f4d2775bda7cec585e8643f4269c4944e5 Mon Sep 17 00:00:00 2001 From: Qi Liu Date: Wed, 9 Jun 2021 14:41:02 +0800 Subject: drivers/perf: Simplify EVENT ATTR macro in fsl_imx8_ddr_perf.c Use common macro PMU_EVENT_ATTR_ID to simplify IMX8_DDR_PMU_EVENT_ATTR Reviewed by Frank Li Cc: Frank Li Cc: Will Deacon Cc: Mark Rutland Signed-off-by: Qi Liu Link: https://lore.kernel.org/r/1623220863-58233-7-git-send-email-liuqi115@huawei.com Signed-off-by: Will Deacon --- drivers/perf/fsl_imx8_ddr_perf.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/perf/fsl_imx8_ddr_perf.c b/drivers/perf/fsl_imx8_ddr_perf.c index df048fe42fc2..2a1d78794a4e 100644 --- a/drivers/perf/fsl_imx8_ddr_perf.c +++ b/drivers/perf/fsl_imx8_ddr_perf.c @@ -222,11 +222,8 @@ ddr_pmu_event_show(struct device *dev, struct device_attribute *attr, return sysfs_emit(page, "event=0x%02llx\n", pmu_attr->id); } -#define IMX8_DDR_PMU_EVENT_ATTR(_name, _id) \ - (&((struct perf_pmu_events_attr[]) { \ - { .attr = __ATTR(_name, 0444, ddr_pmu_event_show, NULL),\ - .id = _id, } \ - })[0].attr.attr) +#define IMX8_DDR_PMU_EVENT_ATTR(_name, _id) \ + PMU_EVENT_ATTR_ID(_name, ddr_pmu_event_show, _id) static struct attribute *ddr_perf_events_attrs[] = { IMX8_DDR_PMU_EVENT_ATTR(cycles, EVENT_CYCLES_ID), -- cgit v1.2.3 From 64432f09068a0fa76f20918a3c22ee3484a3762d Mon Sep 17 00:00:00 2001 From: Qi Liu Date: Wed, 9 Jun 2021 14:41:03 +0800 Subject: arm64: perf: Simplify EVENT ATTR macro in perf_event.c Use common macro PMU_EVENT_ATTR_ID to simplify ARMV8_EVENT_ATTR Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Cc: Will Deacon Signed-off-by: Qi Liu Link: https://lore.kernel.org/r/1623220863-58233-8-git-send-email-liuqi115@huawei.com Signed-off-by: Will Deacon --- arch/arm64/kernel/perf_event.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index a661010308c0..d07788dad388 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -165,10 +165,7 @@ armv8pmu_events_sysfs_show(struct device *dev, } #define ARMV8_EVENT_ATTR(name, config) \ - (&((struct perf_pmu_events_attr) { \ - .attr = __ATTR(name, 0444, armv8pmu_events_sysfs_show, NULL), \ - .id = config, \ - }).attr.attr) + PMU_EVENT_ATTR_ID(name, armv8pmu_events_sysfs_show, config) static struct attribute *armv8_pmuv3_event_attrs[] = { ARMV8_EVENT_ATTR(sw_incr, ARMV8_PMUV3_PERFCTR_SW_INCR), -- cgit v1.2.3 From 78b92c7337e10519312e8aab64d7a1651206bd61 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 9 Jun 2021 11:23:00 +0100 Subject: arm64: insn: decouple patching from insn code Currently, includes . We intend that will be usable from userspace, so it doesn't make sense to include headers for kernel-only features such as the patching routines, and we'd intended to restrict to instruction encoding details. Let's decouple the patching code from , and explicitly include where it is needed. Since isn't included from assembly, we can drop the __ASSEMBLY__ guards. At the same time, sort the kprobes includes so that it's easier to see what is and isn't incldued. 
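As a rough illustration of the intended layering (a sketch, not code from the patch), a hypothetical in-kernel user of the text-patching helpers would now pull in the patching header explicitly instead of getting it transitively from the instruction-encoding header:

#include <linux/types.h>
#include <asm/insn.h>		/* instruction encodings, e.g. aarch64_insn_gen_nop() */
#include <asm/patching.h>	/* text patching, e.g. aarch64_insn_patch_text_nosync() */

/* Hypothetical helper: replace the instruction at @addr with a NOP. */
static int patch_in_nop(void *addr)
{
	u32 nop = aarch64_insn_gen_nop();

	return aarch64_insn_patch_text_nosync(addr, nop);
}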
Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Will Deacon Link: https://lore.kernel.org/r/20210609102301.17332-2-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/insn.h | 1 - arch/arm64/include/asm/patching.h | 2 -- arch/arm64/kernel/ftrace.c | 1 + arch/arm64/kernel/jump_label.c | 1 + arch/arm64/kernel/kgdb.c | 1 + arch/arm64/kernel/patching.c | 1 + arch/arm64/kernel/probes/kprobes.c | 18 ++++++++++-------- arch/arm64/kernel/traps.c | 1 + 8 files changed, 15 insertions(+), 11 deletions(-) diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index 1ea9611545bb..a6f3f45fc46f 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -11,7 +11,6 @@ #include #include -#include #ifndef __ASSEMBLY__ /* diff --git a/arch/arm64/include/asm/patching.h b/arch/arm64/include/asm/patching.h index 5ebab129222f..6bf5adc56295 100644 --- a/arch/arm64/include/asm/patching.h +++ b/arch/arm64/include/asm/patching.h @@ -4,12 +4,10 @@ #include -#ifndef __ASSEMBLY__ int aarch64_insn_read(void *addr, u32 *insnp); int aarch64_insn_write(void *addr, u32 insn); int aarch64_insn_patch_text_nosync(void *addr, u32 insn); int aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt); -#endif /* __ASSEMBLY__ */ #endif /* __ASM_PATCHING_H */ diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index b5d3ddaf69d9..7f467bd9db7a 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -15,6 +15,7 @@ #include #include #include +#include #ifdef CONFIG_DYNAMIC_FTRACE /* diff --git a/arch/arm64/kernel/jump_label.c b/arch/arm64/kernel/jump_label.c index 9a8a0ae1e75f..fc98037e1220 100644 --- a/arch/arm64/kernel/jump_label.c +++ b/arch/arm64/kernel/jump_label.c @@ -8,6 +8,7 @@ #include #include #include +#include void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type) diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c index 1a157ca33262..2aede780fb80 100644 --- a/arch/arm64/kernel/kgdb.c +++ b/arch/arm64/kernel/kgdb.c @@ -17,6 +17,7 @@ #include #include +#include #include struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { diff --git a/arch/arm64/kernel/patching.c b/arch/arm64/kernel/patching.c index 9d050e33901b..7aa55b33c8c7 100644 --- a/arch/arm64/kernel/patching.c +++ b/arch/arm64/kernel/patching.c @@ -9,6 +9,7 @@ #include #include #include +#include #include static DEFINE_RAW_SPINLOCK(patch_lock); diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index d607c9912025..609edde7a5dd 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -7,26 +7,28 @@ * Copyright (C) 2013 Linaro Limited. 
* Author: Sandeepa Prabhu */ +#include #include #include #include -#include -#include -#include #include #include +#include +#include #include +#include #include -#include -#include + #include -#include #include -#include +#include #include -#include #include +#include +#include #include +#include +#include #include "decode-insn.h" diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 9b683b2381cf..48ff6fb888e0 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 3e00e39d9dad48360ebd518726ebf81da1b84c10 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 9 Jun 2021 11:23:01 +0100 Subject: arm64: insn: move AARCH64_INSN_SIZE into For histroical reasons, we define AARCH64_INSN_SIZE in , but it would make more sense to do so in . Let's move it into , and add the necessary include directives for this. Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Will Deacon Link: https://lore.kernel.org/r/20210609102301.17332-3-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/alternative-macros.h | 4 +--- arch/arm64/include/asm/insn.h | 3 +++ arch/arm64/include/asm/kvm_asm.h | 1 + arch/arm64/kernel/cpufeature.c | 1 + arch/arm64/kernel/patching.c | 1 + arch/arm64/kernel/traps.c | 1 + arch/arm64/net/bpf_jit_comp.c | 1 + 7 files changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/alternative-macros.h b/arch/arm64/include/asm/alternative-macros.h index 8a078fc662ac..703fbf310b79 100644 --- a/arch/arm64/include/asm/alternative-macros.h +++ b/arch/arm64/include/asm/alternative-macros.h @@ -3,12 +3,10 @@ #define __ASM_ALTERNATIVE_MACROS_H #include +#include #define ARM64_CB_PATCH ARM64_NCAPS -/* A64 instructions are always 32 bits. */ -#define AARCH64_INSN_SIZE 4 - #ifndef __ASSEMBLY__ #include diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index a6f3f45fc46f..1430b4973039 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -12,6 +12,9 @@ #include +/* A64 instructions are always 32 bits. 
*/ +#define AARCH64_INSN_SIZE 4 + #ifndef __ASSEMBLY__ /* * ARM Architecture Reference Manual for ARMv8 Profile-A, Issue A.a diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index cf8df032b9c3..894edda8cc85 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -8,6 +8,7 @@ #define __ARM_KVM_ASM_H__ #include +#include #include #define ARM_EXIT_WITH_SERROR_BIT 31 diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index efed2830d141..16d35cfffcea 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -76,6 +76,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/arm64/kernel/patching.c b/arch/arm64/kernel/patching.c index 7aa55b33c8c7..9a6edb9c48c7 100644 --- a/arch/arm64/kernel/patching.c +++ b/arch/arm64/kernel/patching.c @@ -8,6 +8,7 @@ #include #include +#include #include #include #include diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 48ff6fb888e0..8f66072fa5cb 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index f7b194878a99..dd5000da18b8 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "bpf_jit.h" -- cgit v1.2.3 From 930a58b4093ebd2a036a0d448a2047477ef90d26 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 8 Jun 2021 19:02:54 +0100 Subject: arm64: cpuinfo: Split AArch32 registers out into a separate struct In preparation for late initialisation of the "sanitised" AArch32 register state, move the AArch32 registers out of 'struct cpuinfo' and into their own struct definition. Acked-by: Mark Rutland Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20210608180313.11502-2-will@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/cpu.h | 46 ++++++++++++++------------ arch/arm64/kernel/cpufeature.c | 74 ++++++++++++++++++++++-------------------- arch/arm64/kernel/cpuinfo.c | 53 ++++++++++++++++-------------- 3 files changed, 92 insertions(+), 81 deletions(-) diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h index 9088e72c7cf6..0f6d16faa540 100644 --- a/arch/arm64/include/asm/cpu.h +++ b/arch/arm64/include/asm/cpu.h @@ -12,27 +12,7 @@ /* * Records attributes of an individual CPU. 
*/ -struct cpuinfo_arm64 { - struct cpu cpu; - struct kobject kobj; - u64 reg_ctr; - u64 reg_cntfrq; - u64 reg_dczid; - u64 reg_midr; - u64 reg_revidr; - u64 reg_gmid; - - u64 reg_id_aa64dfr0; - u64 reg_id_aa64dfr1; - u64 reg_id_aa64isar0; - u64 reg_id_aa64isar1; - u64 reg_id_aa64mmfr0; - u64 reg_id_aa64mmfr1; - u64 reg_id_aa64mmfr2; - u64 reg_id_aa64pfr0; - u64 reg_id_aa64pfr1; - u64 reg_id_aa64zfr0; - +struct cpuinfo_32bit { u32 reg_id_dfr0; u32 reg_id_dfr1; u32 reg_id_isar0; @@ -55,6 +35,30 @@ struct cpuinfo_arm64 { u32 reg_mvfr0; u32 reg_mvfr1; u32 reg_mvfr2; +}; + +struct cpuinfo_arm64 { + struct cpu cpu; + struct kobject kobj; + u64 reg_ctr; + u64 reg_cntfrq; + u64 reg_dczid; + u64 reg_midr; + u64 reg_revidr; + u64 reg_gmid; + + u64 reg_id_aa64dfr0; + u64 reg_id_aa64dfr1; + u64 reg_id_aa64isar0; + u64 reg_id_aa64isar1; + u64 reg_id_aa64mmfr0; + u64 reg_id_aa64mmfr1; + u64 reg_id_aa64mmfr2; + u64 reg_id_aa64pfr0; + u64 reg_id_aa64pfr1; + u64 reg_id_aa64zfr0; + + struct cpuinfo_32bit aarch32; /* pseudo-ZCR for recording maximum ZCR_EL1 LEN value: */ u64 reg_zcr; diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 0645300cc1a8..33e5330ab15b 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -871,6 +871,31 @@ static void __init init_cpu_hwcaps_indirect_list(void) static void __init setup_boot_cpu_capabilities(void); +static void __init init_32bit_cpu_features(struct cpuinfo_32bit *info) +{ + init_cpu_ftr_reg(SYS_ID_DFR0_EL1, info->reg_id_dfr0); + init_cpu_ftr_reg(SYS_ID_DFR1_EL1, info->reg_id_dfr1); + init_cpu_ftr_reg(SYS_ID_ISAR0_EL1, info->reg_id_isar0); + init_cpu_ftr_reg(SYS_ID_ISAR1_EL1, info->reg_id_isar1); + init_cpu_ftr_reg(SYS_ID_ISAR2_EL1, info->reg_id_isar2); + init_cpu_ftr_reg(SYS_ID_ISAR3_EL1, info->reg_id_isar3); + init_cpu_ftr_reg(SYS_ID_ISAR4_EL1, info->reg_id_isar4); + init_cpu_ftr_reg(SYS_ID_ISAR5_EL1, info->reg_id_isar5); + init_cpu_ftr_reg(SYS_ID_ISAR6_EL1, info->reg_id_isar6); + init_cpu_ftr_reg(SYS_ID_MMFR0_EL1, info->reg_id_mmfr0); + init_cpu_ftr_reg(SYS_ID_MMFR1_EL1, info->reg_id_mmfr1); + init_cpu_ftr_reg(SYS_ID_MMFR2_EL1, info->reg_id_mmfr2); + init_cpu_ftr_reg(SYS_ID_MMFR3_EL1, info->reg_id_mmfr3); + init_cpu_ftr_reg(SYS_ID_MMFR4_EL1, info->reg_id_mmfr4); + init_cpu_ftr_reg(SYS_ID_MMFR5_EL1, info->reg_id_mmfr5); + init_cpu_ftr_reg(SYS_ID_PFR0_EL1, info->reg_id_pfr0); + init_cpu_ftr_reg(SYS_ID_PFR1_EL1, info->reg_id_pfr1); + init_cpu_ftr_reg(SYS_ID_PFR2_EL1, info->reg_id_pfr2); + init_cpu_ftr_reg(SYS_MVFR0_EL1, info->reg_mvfr0); + init_cpu_ftr_reg(SYS_MVFR1_EL1, info->reg_mvfr1); + init_cpu_ftr_reg(SYS_MVFR2_EL1, info->reg_mvfr2); +} + void __init init_cpu_features(struct cpuinfo_arm64 *info) { /* Before we start using the tables, make sure it is sorted */ @@ -890,29 +915,8 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info) init_cpu_ftr_reg(SYS_ID_AA64PFR1_EL1, info->reg_id_aa64pfr1); init_cpu_ftr_reg(SYS_ID_AA64ZFR0_EL1, info->reg_id_aa64zfr0); - if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) { - init_cpu_ftr_reg(SYS_ID_DFR0_EL1, info->reg_id_dfr0); - init_cpu_ftr_reg(SYS_ID_DFR1_EL1, info->reg_id_dfr1); - init_cpu_ftr_reg(SYS_ID_ISAR0_EL1, info->reg_id_isar0); - init_cpu_ftr_reg(SYS_ID_ISAR1_EL1, info->reg_id_isar1); - init_cpu_ftr_reg(SYS_ID_ISAR2_EL1, info->reg_id_isar2); - init_cpu_ftr_reg(SYS_ID_ISAR3_EL1, info->reg_id_isar3); - init_cpu_ftr_reg(SYS_ID_ISAR4_EL1, info->reg_id_isar4); - init_cpu_ftr_reg(SYS_ID_ISAR5_EL1, info->reg_id_isar5); - init_cpu_ftr_reg(SYS_ID_ISAR6_EL1, 
info->reg_id_isar6); - init_cpu_ftr_reg(SYS_ID_MMFR0_EL1, info->reg_id_mmfr0); - init_cpu_ftr_reg(SYS_ID_MMFR1_EL1, info->reg_id_mmfr1); - init_cpu_ftr_reg(SYS_ID_MMFR2_EL1, info->reg_id_mmfr2); - init_cpu_ftr_reg(SYS_ID_MMFR3_EL1, info->reg_id_mmfr3); - init_cpu_ftr_reg(SYS_ID_MMFR4_EL1, info->reg_id_mmfr4); - init_cpu_ftr_reg(SYS_ID_MMFR5_EL1, info->reg_id_mmfr5); - init_cpu_ftr_reg(SYS_ID_PFR0_EL1, info->reg_id_pfr0); - init_cpu_ftr_reg(SYS_ID_PFR1_EL1, info->reg_id_pfr1); - init_cpu_ftr_reg(SYS_ID_PFR2_EL1, info->reg_id_pfr2); - init_cpu_ftr_reg(SYS_MVFR0_EL1, info->reg_mvfr0); - init_cpu_ftr_reg(SYS_MVFR1_EL1, info->reg_mvfr1); - init_cpu_ftr_reg(SYS_MVFR2_EL1, info->reg_mvfr2); - } + if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) + init_32bit_cpu_features(&info->aarch32); if (id_aa64pfr0_sve(info->reg_id_aa64pfr0)) { init_cpu_ftr_reg(SYS_ZCR_EL1, info->reg_zcr); @@ -986,20 +990,12 @@ static void relax_cpu_ftr_reg(u32 sys_id, int field) WARN_ON(!ftrp->width); } -static int update_32bit_cpu_features(int cpu, struct cpuinfo_arm64 *info, - struct cpuinfo_arm64 *boot) +static int update_32bit_cpu_features(int cpu, struct cpuinfo_32bit *info, + struct cpuinfo_32bit *boot) { int taint = 0; u64 pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); - /* - * If we don't have AArch32 at all then skip the checks entirely - * as the register values may be UNKNOWN and we're not going to be - * using them for anything. - */ - if (!id_aa64pfr0_32bit_el0(pfr0)) - return taint; - /* * If we don't have AArch32 at EL1, then relax the strictness of * EL1-dependent register fields to avoid spurious sanity check fails. @@ -1151,15 +1147,23 @@ void update_cpu_features(int cpu, * value is the same on all CPUs. */ if (IS_ENABLED(CONFIG_ARM64_MTE) && - id_aa64pfr1_mte(info->reg_id_aa64pfr1)) + id_aa64pfr1_mte(info->reg_id_aa64pfr1)) { taint |= check_update_ftr_reg(SYS_GMID_EL1, cpu, info->reg_gmid, boot->reg_gmid); + } /* + * If we don't have AArch32 at all then skip the checks entirely + * as the register values may be UNKNOWN and we're not going to be + * using them for anything. + * * This relies on a sanitised view of the AArch64 ID registers * (e.g. SYS_ID_AA64PFR0_EL1), so we call it last. */ - taint |= update_32bit_cpu_features(cpu, info, boot); + if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) { + taint |= update_32bit_cpu_features(cpu, &info->aarch32, + &boot->aarch32); + } /* * Mismatched CPU features are a recipe for disaster. 
Don't even diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 5321b8218591..87731fea5e41 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -344,6 +344,32 @@ static void cpuinfo_detect_icache_policy(struct cpuinfo_arm64 *info) pr_info("Detected %s I-cache on CPU%d\n", icache_policy_str[l1ip], cpu); } +static void __cpuinfo_store_cpu_32bit(struct cpuinfo_32bit *info) +{ + info->reg_id_dfr0 = read_cpuid(ID_DFR0_EL1); + info->reg_id_dfr1 = read_cpuid(ID_DFR1_EL1); + info->reg_id_isar0 = read_cpuid(ID_ISAR0_EL1); + info->reg_id_isar1 = read_cpuid(ID_ISAR1_EL1); + info->reg_id_isar2 = read_cpuid(ID_ISAR2_EL1); + info->reg_id_isar3 = read_cpuid(ID_ISAR3_EL1); + info->reg_id_isar4 = read_cpuid(ID_ISAR4_EL1); + info->reg_id_isar5 = read_cpuid(ID_ISAR5_EL1); + info->reg_id_isar6 = read_cpuid(ID_ISAR6_EL1); + info->reg_id_mmfr0 = read_cpuid(ID_MMFR0_EL1); + info->reg_id_mmfr1 = read_cpuid(ID_MMFR1_EL1); + info->reg_id_mmfr2 = read_cpuid(ID_MMFR2_EL1); + info->reg_id_mmfr3 = read_cpuid(ID_MMFR3_EL1); + info->reg_id_mmfr4 = read_cpuid(ID_MMFR4_EL1); + info->reg_id_mmfr5 = read_cpuid(ID_MMFR5_EL1); + info->reg_id_pfr0 = read_cpuid(ID_PFR0_EL1); + info->reg_id_pfr1 = read_cpuid(ID_PFR1_EL1); + info->reg_id_pfr2 = read_cpuid(ID_PFR2_EL1); + + info->reg_mvfr0 = read_cpuid(MVFR0_EL1); + info->reg_mvfr1 = read_cpuid(MVFR1_EL1); + info->reg_mvfr2 = read_cpuid(MVFR2_EL1); +} + static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) { info->reg_cntfrq = arch_timer_get_cntfrq(); @@ -374,31 +400,8 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) if (id_aa64pfr1_mte(info->reg_id_aa64pfr1)) info->reg_gmid = read_cpuid(GMID_EL1); - /* Update the 32bit ID registers only if AArch32 is implemented */ - if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) { - info->reg_id_dfr0 = read_cpuid(ID_DFR0_EL1); - info->reg_id_dfr1 = read_cpuid(ID_DFR1_EL1); - info->reg_id_isar0 = read_cpuid(ID_ISAR0_EL1); - info->reg_id_isar1 = read_cpuid(ID_ISAR1_EL1); - info->reg_id_isar2 = read_cpuid(ID_ISAR2_EL1); - info->reg_id_isar3 = read_cpuid(ID_ISAR3_EL1); - info->reg_id_isar4 = read_cpuid(ID_ISAR4_EL1); - info->reg_id_isar5 = read_cpuid(ID_ISAR5_EL1); - info->reg_id_isar6 = read_cpuid(ID_ISAR6_EL1); - info->reg_id_mmfr0 = read_cpuid(ID_MMFR0_EL1); - info->reg_id_mmfr1 = read_cpuid(ID_MMFR1_EL1); - info->reg_id_mmfr2 = read_cpuid(ID_MMFR2_EL1); - info->reg_id_mmfr3 = read_cpuid(ID_MMFR3_EL1); - info->reg_id_mmfr4 = read_cpuid(ID_MMFR4_EL1); - info->reg_id_mmfr5 = read_cpuid(ID_MMFR5_EL1); - info->reg_id_pfr0 = read_cpuid(ID_PFR0_EL1); - info->reg_id_pfr1 = read_cpuid(ID_PFR1_EL1); - info->reg_id_pfr2 = read_cpuid(ID_PFR2_EL1); - - info->reg_mvfr0 = read_cpuid(MVFR0_EL1); - info->reg_mvfr1 = read_cpuid(MVFR1_EL1); - info->reg_mvfr2 = read_cpuid(MVFR2_EL1); - } + if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) + __cpuinfo_store_cpu_32bit(&info->aarch32); if (IS_ENABLED(CONFIG_ARM64_SVE) && id_aa64pfr0_sve(info->reg_id_aa64pfr0)) -- cgit v1.2.3 From 2122a833316f2f3f6ddc78429fa67ef6d3c86636 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 8 Jun 2021 19:02:55 +0100 Subject: arm64: Allow mismatched 32-bit EL0 support When confronted with a mixture of CPUs, some of which support 32-bit applications and others which don't, we quite sensibly treat the system as 64-bit only for userspace and prevent execve() of 32-bit binaries. 
Unfortunately, some crazy folks have decided to build systems like this with the intention of running 32-bit applications, so relax our sanitisation logic to continue to advertise 32-bit support to userspace on these systems and track the real 32-bit capable cores in a cpumask instead. For now, the default behaviour remains but will be tied to a command-line option in a later patch. Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20210608180313.11502-3-will@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/cpufeature.h | 8 ++- arch/arm64/kernel/cpufeature.c | 114 ++++++++++++++++++++++++++++++++---- arch/arm64/tools/cpucaps | 3 +- 3 files changed, 110 insertions(+), 15 deletions(-) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 650de920e067..9bb9d11750d7 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -637,9 +637,15 @@ static inline bool cpu_supports_mixed_endian_el0(void) return id_aa64mmfr0_mixed_endian_el0(read_cpuid(ID_AA64MMFR0_EL1)); } +const struct cpumask *system_32bit_el0_cpumask(void); +DECLARE_STATIC_KEY_FALSE(arm64_mismatched_32bit_el0); + static inline bool system_supports_32bit_el0(void) { - return cpus_have_const_cap(ARM64_HAS_32BIT_EL0); + u64 pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + + return static_branch_unlikely(&arm64_mismatched_32bit_el0) || + id_aa64pfr0_32bit_el0(pfr0); } static inline bool system_supports_4kb_granule(void) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 33e5330ab15b..52389018ff33 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -107,6 +107,24 @@ DECLARE_BITMAP(boot_capabilities, ARM64_NPATCHABLE); bool arm64_use_ng_mappings = false; EXPORT_SYMBOL(arm64_use_ng_mappings); +/* + * Permit PER_LINUX32 and execve() of 32-bit binaries even if not all CPUs + * support it? + */ +static bool __read_mostly allow_mismatched_32bit_el0; + +/* + * Static branch enabled only if allow_mismatched_32bit_el0 is set and we have + * seen at least one CPU capable of 32-bit EL0. + */ +DEFINE_STATIC_KEY_FALSE(arm64_mismatched_32bit_el0); + +/* + * Mask of CPUs supporting 32-bit EL0. + * Only valid if arm64_mismatched_32bit_el0 is enabled. + */ +static cpumask_var_t cpu_32bit_el0_mask __cpumask_var_read_mostly; + /* * Flag to indicate if we have computed the system wide * capabilities based on the boot time active CPUs. This @@ -775,7 +793,7 @@ static void __init sort_ftr_regs(void) * Any bits that are not covered by an arm64_ftr_bits entry are considered * RES0 for the system-wide value, and must strictly match. 
*/ -static void __init init_cpu_ftr_reg(u32 sys_reg, u64 new) +static void init_cpu_ftr_reg(u32 sys_reg, u64 new) { u64 val = 0; u64 strict_mask = ~0x0ULL; @@ -871,7 +889,7 @@ static void __init init_cpu_hwcaps_indirect_list(void) static void __init setup_boot_cpu_capabilities(void); -static void __init init_32bit_cpu_features(struct cpuinfo_32bit *info) +static void init_32bit_cpu_features(struct cpuinfo_32bit *info) { init_cpu_ftr_reg(SYS_ID_DFR0_EL1, info->reg_id_dfr0); init_cpu_ftr_reg(SYS_ID_DFR1_EL1, info->reg_id_dfr1); @@ -990,6 +1008,22 @@ static void relax_cpu_ftr_reg(u32 sys_id, int field) WARN_ON(!ftrp->width); } +static void lazy_init_32bit_cpu_features(struct cpuinfo_arm64 *info, + struct cpuinfo_arm64 *boot) +{ + static bool boot_cpu_32bit_regs_overridden = false; + + if (!allow_mismatched_32bit_el0 || boot_cpu_32bit_regs_overridden) + return; + + if (id_aa64pfr0_32bit_el0(boot->reg_id_aa64pfr0)) + return; + + boot->aarch32 = info->aarch32; + init_32bit_cpu_features(&boot->aarch32); + boot_cpu_32bit_regs_overridden = true; +} + static int update_32bit_cpu_features(int cpu, struct cpuinfo_32bit *info, struct cpuinfo_32bit *boot) { @@ -1161,6 +1195,7 @@ void update_cpu_features(int cpu, * (e.g. SYS_ID_AA64PFR0_EL1), so we call it last. */ if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) { + lazy_init_32bit_cpu_features(info, boot); taint |= update_32bit_cpu_features(cpu, &info->aarch32, &boot->aarch32); } @@ -1273,6 +1308,28 @@ has_cpuid_feature(const struct arm64_cpu_capabilities *entry, int scope) return feature_matches(val, entry); } +const struct cpumask *system_32bit_el0_cpumask(void) +{ + if (!system_supports_32bit_el0()) + return cpu_none_mask; + + if (static_branch_unlikely(&arm64_mismatched_32bit_el0)) + return cpu_32bit_el0_mask; + + return cpu_possible_mask; +} + +static bool has_32bit_el0(const struct arm64_cpu_capabilities *entry, int scope) +{ + if (!has_cpuid_feature(entry, scope)) + return allow_mismatched_32bit_el0; + + if (scope == SCOPE_SYSTEM) + pr_info("detected: 32-bit EL0 Support\n"); + + return true; +} + static bool has_useable_gicv3_cpuif(const struct arm64_cpu_capabilities *entry, int scope) { bool has_sre; @@ -1891,10 +1948,9 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .cpu_enable = cpu_copy_el2regs, }, { - .desc = "32-bit EL0 Support", - .capability = ARM64_HAS_32BIT_EL0, + .capability = ARM64_HAS_32BIT_EL0_DO_NOT_USE, .type = ARM64_CPUCAP_SYSTEM_FEATURE, - .matches = has_cpuid_feature, + .matches = has_32bit_el0, .sys_reg = SYS_ID_AA64PFR0_EL1, .sign = FTR_UNSIGNED, .field_pos = ID_AA64PFR0_EL0_SHIFT, @@ -2403,7 +2459,7 @@ static const struct arm64_cpu_capabilities compat_elf_hwcaps[] = { {}, }; -static void __init cap_set_elf_hwcap(const struct arm64_cpu_capabilities *cap) +static void cap_set_elf_hwcap(const struct arm64_cpu_capabilities *cap) { switch (cap->hwcap_type) { case CAP_HWCAP: @@ -2448,7 +2504,7 @@ static bool cpus_have_elf_hwcap(const struct arm64_cpu_capabilities *cap) return rc; } -static void __init setup_elf_hwcaps(const struct arm64_cpu_capabilities *hwcaps) +static void setup_elf_hwcaps(const struct arm64_cpu_capabilities *hwcaps) { /* We support emulation of accesses to CPU ID feature registers */ cpu_set_named_feature(CPUID); @@ -2623,7 +2679,7 @@ static void check_early_cpu_features(void) } static void -verify_local_elf_hwcaps(const struct arm64_cpu_capabilities *caps) +__verify_local_elf_hwcaps(const struct arm64_cpu_capabilities *caps) { for (; caps->matches; caps++) @@ -2634,6 +2690,14 @@ 
verify_local_elf_hwcaps(const struct arm64_cpu_capabilities *caps) } } +static void verify_local_elf_hwcaps(void) +{ + __verify_local_elf_hwcaps(arm64_elf_hwcaps); + + if (id_aa64pfr0_32bit_el0(read_cpuid(ID_AA64PFR0_EL1))) + __verify_local_elf_hwcaps(compat_elf_hwcaps); +} + static void verify_sve_features(void) { u64 safe_zcr = read_sanitised_ftr_reg(SYS_ZCR_EL1); @@ -2698,11 +2762,7 @@ static void verify_local_cpu_capabilities(void) * on all secondary CPUs. */ verify_local_cpu_caps(SCOPE_ALL & ~SCOPE_BOOT_CPU); - - verify_local_elf_hwcaps(arm64_elf_hwcaps); - - if (system_supports_32bit_el0()) - verify_local_elf_hwcaps(compat_elf_hwcaps); + verify_local_elf_hwcaps(); if (system_supports_sve()) verify_sve_features(); @@ -2837,6 +2897,34 @@ void __init setup_cpu_features(void) ARCH_DMA_MINALIGN); } +static int enable_mismatched_32bit_el0(unsigned int cpu) +{ + struct cpuinfo_arm64 *info = &per_cpu(cpu_data, cpu); + bool cpu_32bit = id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0); + + if (cpu_32bit) { + cpumask_set_cpu(cpu, cpu_32bit_el0_mask); + static_branch_enable_cpuslocked(&arm64_mismatched_32bit_el0); + setup_elf_hwcaps(compat_elf_hwcaps); + } + + return 0; +} + +static int __init init_32bit_el0_mask(void) +{ + if (!allow_mismatched_32bit_el0) + return 0; + + if (!zalloc_cpumask_var(&cpu_32bit_el0_mask, GFP_KERNEL)) + return -ENOMEM; + + return cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "arm64/mismatched_32bit_el0:online", + enable_mismatched_32bit_el0, NULL); +} +subsys_initcall_sync(init_32bit_el0_mask); + static void __maybe_unused cpu_enable_cnp(struct arm64_cpu_capabilities const *cap) { cpu_replace_ttbr1(lm_alias(swapper_pg_dir)); diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index 21fbdda7086e..49305c2e6dfd 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -3,7 +3,8 @@ # Internal CPU capabilities constants, keep this list sorted BTI -HAS_32BIT_EL0 +# Unreliable: use system_supports_32bit_el0() instead. +HAS_32BIT_EL0_DO_NOT_USE HAS_32BIT_EL1 HAS_ADDRESS_AUTH HAS_ADDRESS_AUTH_ARCH -- cgit v1.2.3 From 2f6a49bbc01da17867c26f6f650b1142e1d7c69d Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 8 Jun 2021 19:02:56 +0100 Subject: KVM: arm64: Kill 32-bit vCPUs on systems with mismatched EL0 support If a vCPU is caught running 32-bit code on a system with mismatched support at EL0, then we should kill it. Acked-by: Marc Zyngier Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20210608180313.11502-4-will@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kvm/arm.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 1cb39c0803a4..5bdba97a7654 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -692,6 +692,15 @@ static void check_vcpu_requests(struct kvm_vcpu *vcpu) } } +static bool vcpu_mode_is_bad_32bit(struct kvm_vcpu *vcpu) +{ + if (likely(!vcpu_mode_is_32bit(vcpu))) + return false; + + return !system_supports_32bit_el0() || + static_branch_unlikely(&arm64_mismatched_32bit_el0); +} + /** * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code * @vcpu: The VCPU pointer @@ -875,7 +884,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) * with the asymmetric AArch32 case), return to userspace with * a fatal error. 
*/ - if (!system_supports_32bit_el0() && vcpu_mode_is_32bit(vcpu)) { + if (vcpu_mode_is_bad_32bit(vcpu)) { /* * As we have caught the guest red-handed, decide that * it isn't fit for purpose anymore by making the vcpu -- cgit v1.2.3 From 873c3e89777c8c56f936ae7aceca1a102aac6b9e Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 8 Jun 2021 19:02:57 +0100 Subject: arm64: Kill 32-bit applications scheduled on 64-bit-only CPUs Scheduling a 32-bit application on a 64-bit-only CPU is a bad idea. Ensure that 32-bit applications always take the slow-path when returning to userspace on a system with mismatched support at EL0, so that we can avoid trying to run on a 64-bit-only CPU and force a SIGKILL instead. Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20210608180313.11502-5-will@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/process.c | 19 ++++++++++++++++++- arch/arm64/kernel/signal.c | 26 ++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index b4bb67f17a2c..f4a91bf1ce0c 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -527,6 +527,15 @@ static void erratum_1418040_thread_switch(struct task_struct *prev, write_sysreg(val, cntkctl_el1); } +static void compat_thread_switch(struct task_struct *next) +{ + if (!is_compat_thread(task_thread_info(next))) + return; + + if (static_branch_unlikely(&arm64_mismatched_32bit_el0)) + set_tsk_thread_flag(next, TIF_NOTIFY_RESUME); +} + static void update_sctlr_el1(u64 sctlr) { /* @@ -568,6 +577,7 @@ __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev, ssbs_thread_switch(next); erratum_1418040_thread_switch(prev, next); ptrauth_thread_switch_user(next); + compat_thread_switch(next); /* * Complete any pending TLB or cache maintenance on this CPU in case @@ -633,8 +643,15 @@ unsigned long arch_align_stack(unsigned long sp) */ void arch_setup_new_exec(void) { - current->mm->context.flags = is_compat_task() ? MMCF_AARCH32 : 0; + unsigned long mmflags = 0; + + if (is_compat_task()) { + mmflags = MMCF_AARCH32; + if (static_branch_unlikely(&arm64_mismatched_32bit_el0)) + set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); + } + current->mm->context.flags = mmflags; ptrauth_thread_init_user(); mte_thread_init_user(); diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 6237486ff6bb..f8192f4ae0b8 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -911,6 +911,19 @@ static void do_signal(struct pt_regs *regs) restore_saved_sigmask(); } +static bool cpu_affinity_invalid(struct pt_regs *regs) +{ + if (!compat_user_mode(regs)) + return false; + + /* + * We're preemptible, but a reschedule will cause us to check the + * affinity again. + */ + return !cpumask_test_cpu(raw_smp_processor_id(), + system_32bit_el0_cpumask()); +} + asmlinkage void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) { @@ -938,6 +951,19 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, if (thread_flags & _TIF_NOTIFY_RESUME) { tracehook_notify_resume(regs); rseq_handle_notify_resume(NULL, regs); + + /* + * If we reschedule after checking the affinity + * then we must ensure that TIF_NOTIFY_RESUME + * is set so that we check the affinity again. + * Since tracehook_notify_resume() clears the + * flag, ensure that the compiler doesn't move + * it after the affinity check. 
+ */ + barrier(); + + if (cpu_affinity_invalid(regs)) + force_sig(SIGKILL); } if (thread_flags & _TIF_FOREIGN_FPSTATE) -- cgit v1.2.3 From b27a9f4119afa460289cd327f403e2ec9c8e0511 Mon Sep 17 00:00:00 2001 From: Daniel Kiss Date: Sun, 13 Jun 2021 11:26:31 +0200 Subject: arm64: Add ARM64_PTR_AUTH_KERNEL config option This patch add the ARM64_PTR_AUTH_KERNEL config and deals with the build aspect of it. Userspace support has no dependency on the toolchain therefore all toolchain checks and build flags are controlled the new config option. The default config behavior will not be changed. Signed-off-by: Daniel Kiss Acked-by: Will Deacon Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20210613092632.93591-2-daniel.kiss@arm.com Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 33 +++++++++++++++++++-------------- arch/arm64/Makefile | 2 +- arch/arm64/kernel/asm-offsets.c | 2 ++ drivers/misc/lkdtm/bugs.c | 6 +++--- 4 files changed, 25 insertions(+), 18 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 9f1d8566bbf9..489e3e42320f 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1481,12 +1481,6 @@ menu "ARMv8.3 architectural features" config ARM64_PTR_AUTH bool "Enable support for pointer authentication" default y - depends on (CC_HAS_SIGN_RETURN_ADDRESS || CC_HAS_BRANCH_PROT_PAC_RET) && AS_HAS_PAC - # Modern compilers insert a .note.gnu.property section note for PAC - # which is only understood by binutils starting with version 2.33.1. - depends on LD_IS_LLD || LD_VERSION >= 23301 || (CC_IS_GCC && GCC_VERSION < 90100) - depends on !CC_IS_CLANG || AS_HAS_CFI_NEGATE_RA_STATE - depends on (!FUNCTION_GRAPH_TRACER || DYNAMIC_FTRACE_WITH_REGS) help Pointer authentication (part of the ARMv8.3 Extensions) provides instructions for signing and authenticating pointers against secret @@ -1498,13 +1492,6 @@ config ARM64_PTR_AUTH for each process at exec() time, with these keys being context-switched along with the process. - If the compiler supports the -mbranch-protection or - -msign-return-address flag (e.g. GCC 7 or later), then this option - will also cause the kernel itself to be compiled with return address - protection. In this case, and if the target hardware is known to - support pointer authentication, then CONFIG_STACKPROTECTOR can be - disabled with minimal loss of protection. - The feature is detected at runtime. If the feature is not present in hardware it will not be advertised to userspace/KVM guest nor will it be enabled. @@ -1515,6 +1502,24 @@ config ARM64_PTR_AUTH but with the feature disabled. On such a system, this option should not be selected. +config ARM64_PTR_AUTH_KERNEL + bool + default y + depends on ARM64_PTR_AUTH + depends on (CC_HAS_SIGN_RETURN_ADDRESS || CC_HAS_BRANCH_PROT_PAC_RET) && AS_HAS_PAC + # Modern compilers insert a .note.gnu.property section note for PAC + # which is only understood by binutils starting with version 2.33.1. + depends on LD_IS_LLD || LD_VERSION >= 23301 || (CC_IS_GCC && GCC_VERSION < 90100) + depends on !CC_IS_CLANG || AS_HAS_CFI_NEGATE_RA_STATE + depends on (!FUNCTION_GRAPH_TRACER || DYNAMIC_FTRACE_WITH_REGS) + help + If the compiler supports the -mbranch-protection or + -msign-return-address flag (e.g. GCC 7 or later), then this option + will cause the kernel itself to be compiled with return address + protection. In this case, and if the target hardware is known to + support pointer authentication, then CONFIG_STACKPROTECTOR can be + disabled with minimal loss of protection. 
+ This feature works with FUNCTION_GRAPH_TRACER option only if DYNAMIC_FTRACE_WITH_REGS is enabled. @@ -1606,7 +1611,7 @@ config ARM64_BTI_KERNEL bool "Use Branch Target Identification for kernel" default y depends on ARM64_BTI - depends on ARM64_PTR_AUTH + depends on ARM64_PTR_AUTH_KERNEL depends on CC_HAS_BRANCH_PROT_PAC_RET_BTI # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94697 depends on !CC_IS_GCC || GCC_VERSION >= 100100 diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index b52481f0605d..3b5b1c480449 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -70,7 +70,7 @@ endif # off, this will be overridden if we are using branch protection. branch-prot-flags-y += $(call cc-option,-mbranch-protection=none) -ifeq ($(CONFIG_ARM64_PTR_AUTH),y) +ifeq ($(CONFIG_ARM64_PTR_AUTH_KERNEL),y) branch-prot-flags-$(CONFIG_CC_HAS_SIGN_RETURN_ADDRESS) := -msign-return-address=all # We enable additional protection for leaf functions as there is some # narrow potential for ROP protection benefits and no substantial diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 0cb34ccb6e73..03420b89c602 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -46,6 +46,8 @@ int main(void) DEFINE(THREAD_SCTLR_USER, offsetof(struct task_struct, thread.sctlr_user)); #ifdef CONFIG_ARM64_PTR_AUTH DEFINE(THREAD_KEYS_USER, offsetof(struct task_struct, thread.keys_user)); +#endif +#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL DEFINE(THREAD_KEYS_KERNEL, offsetof(struct task_struct, thread.keys_kernel)); #endif #ifdef CONFIG_ARM64_MTE diff --git a/drivers/misc/lkdtm/bugs.c b/drivers/misc/lkdtm/bugs.c index 0e8254d0cf0b..a164896dc6d4 100644 --- a/drivers/misc/lkdtm/bugs.c +++ b/drivers/misc/lkdtm/bugs.c @@ -463,7 +463,7 @@ void lkdtm_DOUBLE_FAULT(void) #ifdef CONFIG_ARM64 static noinline void change_pac_parameters(void) { - if (IS_ENABLED(CONFIG_ARM64_PTR_AUTH)) { + if (IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL)) { /* Reset the keys of current task */ ptrauth_thread_init_kernel(current); ptrauth_thread_switch_kernel(current); @@ -477,8 +477,8 @@ noinline void lkdtm_CORRUPT_PAC(void) #define CORRUPT_PAC_ITERATE 10 int i; - if (!IS_ENABLED(CONFIG_ARM64_PTR_AUTH)) - pr_err("FAIL: kernel not built with CONFIG_ARM64_PTR_AUTH\n"); + if (!IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL)) + pr_err("FAIL: kernel not built with CONFIG_ARM64_PTR_AUTH_KERNEL\n"); if (!system_supports_address_auth()) { pr_err("FAIL: CPU lacks pointer authentication feature\n"); -- cgit v1.2.3 From d053e71ac8442d4fd24fb85591489813cdb56365 Mon Sep 17 00:00:00 2001 From: Daniel Kiss Date: Sun, 13 Jun 2021 11:26:32 +0200 Subject: arm64: Conditionally configure PTR_AUTH key of the kernel. If the kernel is not compiled with CONFIG_ARM64_PTR_AUTH_KERNEL=y, then no PACI/AUTI instructions are expected while the kernel is running so the kernel's key will not be used. Write of a system registers is expensive therefore avoid if not required. 
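The pattern at work here is worth spelling out: provide the helper unconditionally, but let it compile to nothing when the option is off, so callers never need their own #ifdefs and the disabled build pays no cost. Below is a minimal stand-alone C sketch of that shape, with invented names standing in for the kernel macros (ENABLE_KERNEL_KEYS plays the role of CONFIG_ARM64_PTR_AUTH_KERNEL); it is illustrative only, not kernel code.

#include <stdio.h>

#ifdef ENABLE_KERNEL_KEYS
static void install_kernel_key(void)
{
        /* In the kernel this is where the expensive system register
         * write would happen. */
        printf("installing kernel pointer-auth key\n");
}
#else
static void install_kernel_key(void)
{
        /* Option disabled: the key is never used, so do nothing at all. */
}
#endif

int main(void)
{
        install_kernel_key();   /* callers stay free of #ifdefs */
        return 0;
}

Built with -DENABLE_KERNEL_KEYS the helper does real work; built without it, the call compiles away entirely, which is the effect the stub assembly macros below aim for.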
Signed-off-by: Daniel Kiss Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20210613092632.93591-3-daniel.kiss@arm.com Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 2 +- arch/arm64/include/asm/asm_pointer_auth.h | 49 ++++++++++++++----------- arch/arm64/include/asm/pointer_auth.h | 59 +++++++++++++++++-------------- arch/arm64/include/asm/processor.h | 2 ++ arch/arm64/kernel/asm-offsets.c | 2 ++ 5 files changed, 67 insertions(+), 47 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 489e3e42320f..dabe9b81012f 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1503,7 +1503,7 @@ config ARM64_PTR_AUTH not be selected. config ARM64_PTR_AUTH_KERNEL - bool + bool "Use pointer authentication for kernel" default y depends on ARM64_PTR_AUTH depends on (CC_HAS_SIGN_RETURN_ADDRESS || CC_HAS_BRANCH_PROT_PAC_RET) && AS_HAS_PAC diff --git a/arch/arm64/include/asm/asm_pointer_auth.h b/arch/arm64/include/asm/asm_pointer_auth.h index 8ca2dc0661ee..f1bba5fc61c4 100644 --- a/arch/arm64/include/asm/asm_pointer_auth.h +++ b/arch/arm64/include/asm/asm_pointer_auth.h @@ -7,19 +7,7 @@ #include #include -#ifdef CONFIG_ARM64_PTR_AUTH -/* - * thread.keys_user.ap* as offset exceeds the #imm offset range - * so use the base value of ldp as thread.keys_user and offset as - * thread.keys_user.ap*. - */ - .macro __ptrauth_keys_install_user tsk, tmp1, tmp2, tmp3 - mov \tmp1, #THREAD_KEYS_USER - add \tmp1, \tsk, \tmp1 - ldp \tmp2, \tmp3, [\tmp1, #PTRAUTH_USER_KEY_APIA] - msr_s SYS_APIAKEYLO_EL1, \tmp2 - msr_s SYS_APIAKEYHI_EL1, \tmp3 - .endm +#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL .macro __ptrauth_keys_install_kernel_nosync tsk, tmp1, tmp2, tmp3 mov \tmp1, #THREAD_KEYS_KERNEL @@ -42,6 +30,33 @@ alternative_if ARM64_HAS_ADDRESS_AUTH alternative_else_nop_endif .endm +#else /* CONFIG_ARM64_PTR_AUTH_KERNEL */ + + .macro __ptrauth_keys_install_kernel_nosync tsk, tmp1, tmp2, tmp3 + .endm + + .macro ptrauth_keys_install_kernel_nosync tsk, tmp1, tmp2, tmp3 + .endm + + .macro ptrauth_keys_install_kernel tsk, tmp1, tmp2, tmp3 + .endm + +#endif /* CONFIG_ARM64_PTR_AUTH_KERNEL */ + +#ifdef CONFIG_ARM64_PTR_AUTH +/* + * thread.keys_user.ap* as offset exceeds the #imm offset range + * so use the base value of ldp as thread.keys_user and offset as + * thread.keys_user.ap*. 
+ */ + .macro __ptrauth_keys_install_user tsk, tmp1, tmp2, tmp3 + mov \tmp1, #THREAD_KEYS_USER + add \tmp1, \tsk, \tmp1 + ldp \tmp2, \tmp3, [\tmp1, #PTRAUTH_USER_KEY_APIA] + msr_s SYS_APIAKEYLO_EL1, \tmp2 + msr_s SYS_APIAKEYHI_EL1, \tmp3 + .endm + .macro __ptrauth_keys_init_cpu tsk, tmp1, tmp2, tmp3 mrs \tmp1, id_aa64isar1_el1 ubfx \tmp1, \tmp1, #ID_AA64ISAR1_APA_SHIFT, #8 @@ -64,17 +79,11 @@ alternative_else_nop_endif .Lno_addr_auth\@: .endm -#else /* CONFIG_ARM64_PTR_AUTH */ +#else /* !CONFIG_ARM64_PTR_AUTH */ .macro ptrauth_keys_install_user tsk, tmp1, tmp2, tmp3 .endm - .macro ptrauth_keys_install_kernel_nosync tsk, tmp1, tmp2, tmp3 - .endm - - .macro ptrauth_keys_install_kernel tsk, tmp1, tmp2, tmp3 - .endm - #endif /* CONFIG_ARM64_PTR_AUTH */ #endif /* __ASM_ASM_POINTER_AUTH_H */ diff --git a/arch/arm64/include/asm/pointer_auth.h b/arch/arm64/include/asm/pointer_auth.h index d50416be99be..28a78b67d9b4 100644 --- a/arch/arm64/include/asm/pointer_auth.h +++ b/arch/arm64/include/asm/pointer_auth.h @@ -31,10 +31,6 @@ struct ptrauth_keys_user { struct ptrauth_key apga; }; -struct ptrauth_keys_kernel { - struct ptrauth_key apia; -}; - #define __ptrauth_key_install_nosync(k, v) \ do { \ struct ptrauth_key __pki_v = (v); \ @@ -42,6 +38,29 @@ do { \ write_sysreg_s(__pki_v.hi, SYS_ ## k ## KEYHI_EL1); \ } while (0) +#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL + +struct ptrauth_keys_kernel { + struct ptrauth_key apia; +}; + +static __always_inline void ptrauth_keys_init_kernel(struct ptrauth_keys_kernel *keys) +{ + if (system_supports_address_auth()) + get_random_bytes(&keys->apia, sizeof(keys->apia)); +} + +static __always_inline void ptrauth_keys_switch_kernel(struct ptrauth_keys_kernel *keys) +{ + if (!system_supports_address_auth()) + return; + + __ptrauth_key_install_nosync(APIA, keys->apia); + isb(); +} + +#endif /* CONFIG_ARM64_PTR_AUTH_KERNEL */ + static inline void ptrauth_keys_install_user(struct ptrauth_keys_user *keys) { if (system_supports_address_auth()) { @@ -69,21 +88,6 @@ static inline void ptrauth_keys_init_user(struct ptrauth_keys_user *keys) ptrauth_keys_install_user(keys); } -static __always_inline void ptrauth_keys_init_kernel(struct ptrauth_keys_kernel *keys) -{ - if (system_supports_address_auth()) - get_random_bytes(&keys->apia, sizeof(keys->apia)); -} - -static __always_inline void ptrauth_keys_switch_kernel(struct ptrauth_keys_kernel *keys) -{ - if (!system_supports_address_auth()) - return; - - __ptrauth_key_install_nosync(APIA, keys->apia); - isb(); -} - extern int ptrauth_prctl_reset_keys(struct task_struct *tsk, unsigned long arg); extern int ptrauth_set_enabled_keys(struct task_struct *tsk, unsigned long keys, @@ -121,11 +125,6 @@ static __always_inline void ptrauth_enable(void) #define ptrauth_thread_switch_user(tsk) \ ptrauth_keys_install_user(&(tsk)->thread.keys_user) -#define ptrauth_thread_init_kernel(tsk) \ - ptrauth_keys_init_kernel(&(tsk)->thread.keys_kernel) -#define ptrauth_thread_switch_kernel(tsk) \ - ptrauth_keys_switch_kernel(&(tsk)->thread.keys_kernel) - #else /* CONFIG_ARM64_PTR_AUTH */ #define ptrauth_enable() #define ptrauth_prctl_reset_keys(tsk, arg) (-EINVAL) @@ -134,11 +133,19 @@ static __always_inline void ptrauth_enable(void) #define ptrauth_strip_insn_pac(lr) (lr) #define ptrauth_suspend_exit() #define ptrauth_thread_init_user() -#define ptrauth_thread_init_kernel(tsk) #define ptrauth_thread_switch_user(tsk) -#define ptrauth_thread_switch_kernel(tsk) #endif /* CONFIG_ARM64_PTR_AUTH */ +#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL +#define 
ptrauth_thread_init_kernel(tsk) \ + ptrauth_keys_init_kernel(&(tsk)->thread.keys_kernel) +#define ptrauth_thread_switch_kernel(tsk) \ + ptrauth_keys_switch_kernel(&(tsk)->thread.keys_kernel) +#else +#define ptrauth_thread_init_kernel(tsk) +#define ptrauth_thread_switch_kernel(tsk) +#endif /* CONFIG_ARM64_PTR_AUTH_KERNEL */ + #define PR_PAC_ENABLED_KEYS_MASK \ (PR_PAC_APIAKEY | PR_PAC_APIBKEY | PR_PAC_APDAKEY | PR_PAC_APDBKEY) diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 9df3feeee890..e7d50c6f700d 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -148,8 +148,10 @@ struct thread_struct { struct debug_info debug; /* debugging */ #ifdef CONFIG_ARM64_PTR_AUTH struct ptrauth_keys_user keys_user; +#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL struct ptrauth_keys_kernel keys_kernel; #endif +#endif #ifdef CONFIG_ARM64_MTE u64 gcr_user_excl; #endif diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 03420b89c602..c9e72d92606f 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -155,7 +155,9 @@ int main(void) #endif #ifdef CONFIG_ARM64_PTR_AUTH DEFINE(PTRAUTH_USER_KEY_APIA, offsetof(struct ptrauth_keys_user, apia)); +#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL DEFINE(PTRAUTH_KERNEL_KEY_APIA, offsetof(struct ptrauth_keys_kernel, apia)); +#endif BLANK(); #endif return 0; -- cgit v1.2.3 From 0f473ac746a992b3afd994ccd1ac73052ea256f2 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 14 Jun 2021 15:40:11 +0530 Subject: arm64/mm: Drop SWAPPER_INIT_MAP_SIZE The commit cdef5f6e9e0e ("arm64: mm: allocate pagetables anywhere") had dropped the last reference to SWAPPER_INIT_MAP_SIZE. Hence just clean up. Cc: Catalin Marinas Cc: Will Deacon Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Reviewed-by: Gavin Shan Link: https://lore.kernel.org/r/1623665411-20055-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/kernel-pgtable.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index d44df9d62fc9..e2f103cce7c1 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -100,9 +100,6 @@ #define SWAPPER_TABLE_SHIFT PMD_SHIFT #endif -/* The size of the initial kernel direct mapping */ -#define SWAPPER_INIT_MAP_SIZE (_AC(1, UL) << SWAPPER_TABLE_SHIFT) - /* * Initial memory map attributes. */ -- cgit v1.2.3 From ca6ece6a76a8b5d8b428429c2803df48a69ee88b Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 14 Jun 2021 15:12:35 +0530 Subject: arm64/mm: Use CONT_PMD_SHIFT for ARM64_MEMSTART_SHIFT ARM64_MEMSTART_SIZE needs to be aligned with CONT_PMD_SIZE on 16K page size config. Hence just directly use CONT_PMD_SHIFT. 
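To see why the symbolic name is preferable, note that a shift macro simply encodes an alignment granule of 1 << shift bytes, and on the 16K-page configuration CONT_PMD_SHIFT is exactly the PMD_SHIFT + 5 that the old literal spelled out by hand. A rough stand-alone sketch follows; the concrete values (PMD_SHIFT = 25, a 32-entry contiguous span) are assumptions for illustration, not taken from the headers.

#include <stdio.h>

/* Assumed 16K-page values, for illustration only. */
#define PMD_SHIFT       25                      /* one PMD entry maps 32MB   */
#define CONT_PMD_SHIFT  (PMD_SHIFT + 5)         /* 32 contiguous PMDs = 1GB  */

#define GRANULE(shift)          (1UL << (shift))
#define IS_ALIGNED_TO(x, shift) (((x) & (GRANULE(shift) - 1)) == 0)

int main(void)
{
        unsigned long memstart = 3UL << CONT_PMD_SHIFT; /* some aligned base */

        printf("granule: %lu MB\n", GRANULE(CONT_PMD_SHIFT) >> 20);
        printf("aligned: %d\n", IS_ALIGNED_TO(memstart, CONT_PMD_SHIFT));
        return 0;
}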
Cc: Catalin Marinas Cc: Will Deacon Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Reviewed-by: Gavin Shan Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/1623663755-8949-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/kernel-pgtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index e2f103cce7c1..c5f18f2408b5 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -122,7 +122,7 @@ #if defined(CONFIG_ARM64_4K_PAGES) #define ARM64_MEMSTART_SHIFT PUD_SHIFT #elif defined(CONFIG_ARM64_16K_PAGES) -#define ARM64_MEMSTART_SHIFT (PMD_SHIFT + 5) +#define ARM64_MEMSTART_SHIFT CONT_PMD_SHIFT #else #define ARM64_MEMSTART_SHIFT PMD_SHIFT #endif -- cgit v1.2.3 From 4aaa87ab3d2de485d8aae7a88cc9cb02dcd2c450 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 14 Jun 2021 13:48:26 +0530 Subject: arm64/mm: Drop SECTION_[SHIFT|SIZE|MASK] SECTION_[SHIFT|SIZE|MASK] are essentially PMD_[SHIFT|SIZE|MASK]. But these create confusion being similar to generic sparsemem memory sections, which are derived from SECTION_SIZE_BITS. Section references have always implied PMD level block mapping. Instead just use all PMD level macros which would make it explicit and also remove confusion with sparsmem memory sections. Cc: Catalin Marinas Cc: Will Deacon Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Reviewed-by: Gavin Shan Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/1623658706-7182-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/kernel-pgtable.h | 4 ++-- arch/arm64/include/asm/pgtable-hwdef.h | 7 ------- arch/arm64/mm/mmu.c | 2 +- 3 files changed, 3 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index c5f18f2408b5..1260187adb31 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -91,8 +91,8 @@ /* Initial memory map size */ #if ARM64_SWAPPER_USES_SECTION_MAPS -#define SWAPPER_BLOCK_SHIFT SECTION_SHIFT -#define SWAPPER_BLOCK_SIZE SECTION_SIZE +#define SWAPPER_BLOCK_SHIFT PMD_SHIFT +#define SWAPPER_BLOCK_SIZE PMD_SIZE #define SWAPPER_TABLE_SHIFT PUD_SHIFT #else #define SWAPPER_BLOCK_SHIFT PAGE_SHIFT diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index b82575a33f8b..40085e53f573 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -71,13 +71,6 @@ #define PGDIR_MASK (~(PGDIR_SIZE-1)) #define PTRS_PER_PGD (1 << (VA_BITS - PGDIR_SHIFT)) -/* - * Section address mask and size definitions. - */ -#define SECTION_SHIFT PMD_SHIFT -#define SECTION_SIZE (_AC(1, UL) << SECTION_SHIFT) -#define SECTION_MASK (~(SECTION_SIZE-1)) - /* * Contiguous page definitions. 
*/ diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 3d34cd127f6b..5b75f7eefb72 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -228,7 +228,7 @@ static void init_pmd(pud_t *pudp, unsigned long addr, unsigned long end, next = pmd_addr_end(addr, end); /* try section mapping first */ - if (((addr | next | phys) & ~SECTION_MASK) == 0 && + if (((addr | next | phys) & ~PMD_MASK) == 0 && (flags & NO_BLOCK_MAPPINGS) == 0) { pmd_set_huge(pmdp, phys, prot); -- cgit v1.2.3 From 84c5e23edecd7013ceaed8460deed5c33842cb8d Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Mon, 14 Jun 2021 20:27:01 +0800 Subject: arm64: mm: Pass original fault address to handle_mm_fault() Currently, the lower bits of the fault address are cleared before it is passed to handle_mm_fault(). This is unnecessary, since the generic code has done the same thing since commit 1a29d85eb0f19 ("mm: use vmf->address instead of of vmf->virtual_address"). This passes the original fault address to handle_mm_fault() in case the generic code needs to know the exact fault address. Signed-off-by: Gavin Shan Acked-by: Catalin Marinas Reviewed-by: Anshuman Khandual Link: https://lore.kernel.org/r/20210614122701.100515-1-gshan@redhat.com Signed-off-by: Will Deacon --- arch/arm64/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 6786cf152666..bd9a0bb5fb56 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -509,7 +509,7 @@ static vm_fault_t __do_page_fault(struct mm_struct *mm, unsigned long addr, */ if (!(vma->vm_flags & vm_flags)) return VM_FAULT_BADACCESS; - return handle_mm_fault(vma, addr & PAGE_MASK, mm_flags, regs); + return handle_mm_fault(vma, addr, mm_flags, regs); } static bool is_el0_instruction_abort(unsigned int esr) -- cgit v1.2.3 From 9163f01130304fab1f74683d7d44632da7bda637 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 15 Jun 2021 15:02:58 +0530 Subject: arm64/mm: Fix ttbr0 values stored in struct thread_info for software-pan When using CONFIG_ARM64_SW_TTBR0_PAN, a task's thread_info::ttbr0 must be the TTBR0_EL1 value used to run userspace. With 52-bit PAs, the PA must be packed into the TTBR using phys_to_ttbr(), but we forget to do this in some of the SW PAN code. Thus, if the value is installed into TTBR0_EL1 (as may happen in the uaccess routines), this could result in UNPREDICTABLE behaviour. Since hardware with 52-bit PA support almost certainly has HW PAN, which will be used in preference, this shouldn't be a practical issue, but let's fix this for consistency.
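For readers unfamiliar with the packing: with 52-bit PAs the top physical address bits do not fit in their natural TTBR position, so they are folded into the low bits of the baddr field. The stand-alone sketch below approximates what phys_to_ttbr() does as I understand it, with PA bits [51:48] folded down to TTBR bits [5:2]; the constants are assumptions, not copied from the kernel headers.

#include <stdio.h>

/* Assumed layout: baddr occupies TTBR bits [47:2], PA[51:48] lands in [5:2]. */
#define TTBR_BADDR_MASK_52      (((1UL << 46) - 1) << 2)

static unsigned long phys_to_ttbr_sketch(unsigned long phys)
{
        return (phys | (phys >> 46)) & TTBR_BADDR_MASK_52;
}

int main(void)
{
        unsigned long pa = (0xfUL << 48) | 0x40000000UL;        /* a 52-bit PA */

        printf("pa   = %#lx\n", pa);
        printf("ttbr = %#lx\n", phys_to_ttbr_sketch(pa));
        return 0;
}

The point of the fix is simply that thread_info::ttbr0 must already hold this packed form, because the value may be written straight into TTBR0_EL1 later.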
Cc: Catalin Marinas Cc: Will Deacon Cc: Mark Rutland Cc: James Morse Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Fixes: 529c4b05a3cb ("arm64: handle 52-bit addresses in TTBR") Signed-off-by: Anshuman Khandual Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/1623749578-11231-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/mmu_context.h | 4 ++-- arch/arm64/kernel/setup.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index d3cef9133539..eeb210997149 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -177,9 +177,9 @@ static inline void update_saved_ttbr0(struct task_struct *tsk, return; if (mm == &init_mm) - ttbr = __pa_symbol(reserved_pg_dir); + ttbr = phys_to_ttbr(__pa_symbol(reserved_pg_dir)); else - ttbr = virt_to_phys(mm->pgd) | ASID(mm) << 48; + ttbr = phys_to_ttbr(virt_to_phys(mm->pgd)) | ASID(mm) << 48; WRITE_ONCE(task_thread_info(tsk)->ttbr0, ttbr); } diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 61845c0821d9..68b30e8c22db 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -381,7 +381,7 @@ void __init __no_sanitize_address setup_arch(char **cmdline_p) * faults in case uaccess_enable() is inadvertently called by the init * thread. */ - init_task.thread_info.ttbr0 = __pa_symbol(reserved_pg_dir); + init_task.thread_info.ttbr0 = phys_to_ttbr(__pa_symbol(reserved_pg_dir)); #endif if (boot_args[1] || boot_args[2] || boot_args[3]) { -- cgit v1.2.3 From c70fe14f83ae0793a1119fa5741b19ab9ba411b2 Mon Sep 17 00:00:00 2001 From: Dong Aisheng Date: Tue, 18 May 2021 18:14:03 +0800 Subject: arm64: mm: fix the count comments in compute_indices 'count - 1' is confusing and not comply with the real code running. 'count' actually represents the extra entries required, no need minus 1. Signed-off-by: Dong Aisheng Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20210518101405.1048860-3-aisheng.dong@nxp.com Signed-off-by: Will Deacon --- arch/arm64/kernel/head.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 96873dfa67fd..b70db34458ec 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -195,7 +195,7 @@ SYM_CODE_END(preserve_boot_args) and \iend, \iend, \istart // iend = (vend >> shift) & (ptrs - 1) mov \istart, \ptrs mul \istart, \istart, \count - add \iend, \iend, \istart // iend += (count - 1) * ptrs + add \iend, \iend, \istart // iend += count * ptrs // our entries span multiple tables lsr \istart, \vstart, \shift -- cgit v1.2.3 From f91671b5418bde81a7ce6bb2e9f3f4d41184b77c Mon Sep 17 00:00:00 2001 From: Dong Aisheng Date: Tue, 18 May 2021 18:14:04 +0800 Subject: arm64: mm: drop unused __pa(__idmap_text_start) x5 is not used in the following map_memory. Instead, __pa(__idmap_text_start) is stored in x3 which is used later. 
Signed-off-by: Dong Aisheng Acked-by: Catalin Marinas Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20210518101405.1048860-4-aisheng.dong@nxp.com Signed-off-by: Will Deacon --- arch/arm64/kernel/head.S | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index b70db34458ec..d266b4c6287d 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -354,7 +354,6 @@ SYM_FUNC_START_LOCAL(__create_page_tables) #endif 1: ldr_l x4, idmap_ptrs_per_pgd - mov x5, x3 // __pa(__idmap_text_start) adr_l x6, __idmap_text_end // __pa(__idmap_text_end) map_memory x0, x1, x3, x6, x7, x3, x4, x10, x11, x12, x13, x14 -- cgit v1.2.3 From 7957a3db01bf533a235a9ae9333150abbe6bde32 Mon Sep 17 00:00:00 2001 From: Dong Aisheng Date: Tue, 18 May 2021 18:14:05 +0800 Subject: arm64: head: fix code comments in set_cpu_boot_mode_flag Up to here, the CPU boot mode can either be EL1 or EL2. Correct the code comments a bit. Signed-off-by: Dong Aisheng Acked-by: Catalin Marinas Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20210518101405.1048860-5-aisheng.dong@nxp.com Signed-off-by: Will Deacon --- arch/arm64/kernel/head.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index d266b4c6287d..3b88000841d9 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -550,7 +550,7 @@ SYM_FUNC_START_LOCAL(set_cpu_boot_mode_flag) cmp w0, #BOOT_CPU_MODE_EL2 b.ne 1f add x1, x1, #4 -1: str w0, [x1] // This CPU has booted in EL1 +1: str w0, [x1] // Save CPU boot mode dmb sy dc ivac, x1 // Invalidate potentially stale cache line ret -- cgit v1.2.3 From 8848f0665b3cd4fbb3107b384f5205380c90634d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 15 Jun 2021 12:12:24 +0100 Subject: arm64: Add cpuidle context save/restore helpers As we need to start doing some additional work on all idle paths, let's introduce a set of macros that will perform the work related to the GICv3 pseudo-NMI idle entry exit. Stubs are introduced to 32bit ARM for compatibility. As these helpers are currently unused, there is no functional change. 
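The helpers are deliberately shaped as a save/act/restore bracket around the idle operation. Here is a tiny stand-alone sketch of that calling pattern, with plain variables standing in for the PMR and DAIF state; the names and values are invented, only the ordering is modelled on the macros in the diff below.

#include <stdio.h>

struct irq_ctx {
        unsigned long pmr;
        unsigned long daif;
};

/* Fake "register" state standing in for the real PMR/DAIF. */
static unsigned long fake_pmr  = 0x80;
static unsigned long fake_daif = 0x00;

static void ctx_save(struct irq_ctx *c)
{
        c->daif = fake_daif;
        fake_daif = 0xc0;       /* mask via DAIF first...              */
        c->pmr = fake_pmr;
        fake_pmr = 0xf0;        /* ...then open up the PMR for wake-up */
}

static void ctx_restore(struct irq_ctx *c)
{
        fake_pmr = c->pmr;      /* undo in the opposite order */
        fake_daif = c->daif;
}

int main(void)
{
        struct irq_ctx c;

        ctx_save(&c);
        printf("idle with pmr=%#lx daif=%#lx\n", fake_pmr, fake_daif);
        ctx_restore(&c);
        printf("back with pmr=%#lx daif=%#lx\n", fake_pmr, fake_daif);
        return 0;
}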
Tested-by: Valentin Schneider Reviewed-by: Lorenzo Pieralisi Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210615111227.2454465-2-maz@kernel.org Signed-off-by: Will Deacon --- arch/arm/include/asm/cpuidle.h | 5 +++++ arch/arm64/include/asm/cpuidle.h | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/arch/arm/include/asm/cpuidle.h b/arch/arm/include/asm/cpuidle.h index 0d67ed682e07..dc8f53f1a219 100644 --- a/arch/arm/include/asm/cpuidle.h +++ b/arch/arm/include/asm/cpuidle.h @@ -49,4 +49,9 @@ extern int arm_cpuidle_suspend(int index); extern int arm_cpuidle_init(int cpu); +struct arm_cpuidle_irq_context { }; + +#define arm_cpuidle_save_irq_context(c) (void)c +#define arm_cpuidle_restore_irq_context(c) (void)c + #endif diff --git a/arch/arm64/include/asm/cpuidle.h b/arch/arm64/include/asm/cpuidle.h index 3c5ddb429ea2..14a19d1141bd 100644 --- a/arch/arm64/include/asm/cpuidle.h +++ b/arch/arm64/include/asm/cpuidle.h @@ -18,4 +18,39 @@ static inline int arm_cpuidle_suspend(int index) return -EOPNOTSUPP; } #endif + +#ifdef CONFIG_ARM64_PSEUDO_NMI +#include + +struct arm_cpuidle_irq_context { + unsigned long pmr; + unsigned long daif_bits; +}; + +#define arm_cpuidle_save_irq_context(__c) \ + do { \ + struct arm_cpuidle_irq_context *c = __c; \ + if (system_uses_irq_prio_masking()) { \ + c->daif_bits = read_sysreg(daif); \ + write_sysreg(c->daif_bits | PSR_I_BIT | PSR_F_BIT, \ + daif); \ + c->pmr = gic_read_pmr(); \ + gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); \ + } \ + } while (0) + +#define arm_cpuidle_restore_irq_context(__c) \ + do { \ + struct arm_cpuidle_irq_context *c = __c; \ + if (system_uses_irq_prio_masking()) { \ + gic_write_pmr(c->pmr); \ + write_sysreg(c->daif_bits, daif); \ + } \ + } while (0) +#else +struct arm_cpuidle_irq_context { }; + +#define arm_cpuidle_save_irq_context(c) (void)c +#define arm_cpuidle_restore_irq_context(c) (void)c +#endif #endif -- cgit v1.2.3 From d4dc10277255afc303de4f00cbee0b9ce74d870f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 15 Jun 2021 12:12:25 +0100 Subject: arm64: Convert cpu_do_idle() to using cpuidle context helpers Now that we have helpers that are aware of the pseudo-NMI feature, introduce them to cpu_do_idle(). This allows for some nice cleanup. No functional change intended. Tested-by: Valentin Schneider Reviewed-by: Lorenzo Pieralisi Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210615111227.2454465-3-maz@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/process.c | 41 +++++++++-------------------------------- 1 file changed, 9 insertions(+), 32 deletions(-) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index b4bb67f17a2c..b715c6b2558f 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -46,9 +46,9 @@ #include #include -#include #include #include +#include #include #include #include @@ -74,33 +74,6 @@ EXPORT_SYMBOL_GPL(pm_power_off); void (*arm_pm_restart)(enum reboot_mode reboot_mode, const char *cmd); -static void noinstr __cpu_do_idle(void) -{ - dsb(sy); - wfi(); -} - -static void noinstr __cpu_do_idle_irqprio(void) -{ - unsigned long pmr; - unsigned long daif_bits; - - daif_bits = read_sysreg(daif); - write_sysreg(daif_bits | PSR_I_BIT | PSR_F_BIT, daif); - - /* - * Unmask PMR before going idle to make sure interrupts can - * be raised. 
- */ - pmr = gic_read_pmr(); - gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); - - __cpu_do_idle(); - - gic_write_pmr(pmr); - write_sysreg(daif_bits, daif); -} - /* * cpu_do_idle() * @@ -112,10 +85,14 @@ static void noinstr __cpu_do_idle_irqprio(void) */ void noinstr cpu_do_idle(void) { - if (system_uses_irq_prio_masking()) - __cpu_do_idle_irqprio(); - else - __cpu_do_idle(); + struct arm_cpuidle_irq_context context; + + arm_cpuidle_save_irq_context(&context); + + dsb(sy); + wfi(); + + arm_cpuidle_restore_irq_context(&context); } /* -- cgit v1.2.3 From c9223b616298c3d0e6ff5dd20d14d65c2131c535 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 15 Jun 2021 12:12:26 +0100 Subject: PSCI: Use cpuidle context helpers in psci_cpu_suspend_enter() The PSCI CPU suspend code isn't aware of the PMR vs DAIF game, resulting in a system that locks up if entering CPU suspend with GICv3 pNMI enabled. To save the day, teach the suspend code about our new cpuidle context helpers, which will do everything that's required just like the usual WFI cpuidle code. This fixes my Altra system, which would otherwise lock-up at boot time when booted with irqchip.gicv3_pseudo_nmi=1. Tested-by: Valentin Schneider Reviewed-by: Lorenzo Pieralisi Signed-off-by: Marc Zyngier Reviewed-by: Sudeep Holla Link: https://lore.kernel.org/r/20210615111227.2454465-4-maz@kernel.org Signed-off-by: Will Deacon --- drivers/firmware/psci/psci.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c index 3c1c5daf6df2..e3da38e15c5b 100644 --- a/drivers/firmware/psci/psci.c +++ b/drivers/firmware/psci/psci.c @@ -335,10 +335,15 @@ int psci_cpu_suspend_enter(u32 state) { int ret; - if (!psci_power_state_loses_context(state)) + if (!psci_power_state_loses_context(state)) { + struct arm_cpuidle_irq_context context; + + arm_cpuidle_save_irq_context(&context); ret = psci_ops.cpu_suspend(state, 0); - else + arm_cpuidle_restore_irq_context(&context); + } else { ret = cpu_suspend(state, psci_suspend_finisher); + } return ret; } -- cgit v1.2.3 From 77345ef70445a8f16e0685dade0d68bdf41f19d7 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 15 Jun 2021 12:12:27 +0100 Subject: arm64: suspend: Use cpuidle context helpers in cpu_suspend() Use cpuidle context helpers to switch to using DAIF.IF instead of PMR to mask interrupts, ensuring that we suspend with interrupts being able to reach the CPU interface. 
Signed-off-by: Marc Zyngier Reviewed-by: Sudeep Holla Link: https://lore.kernel.org/r/20210615111227.2454465-5-maz@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/suspend.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c index e3f72df9509d..938ce6fbee8a 100644 --- a/arch/arm64/kernel/suspend.c +++ b/arch/arm64/kernel/suspend.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -91,6 +92,7 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) int ret = 0; unsigned long flags; struct sleep_stack_data state; + struct arm_cpuidle_irq_context context; /* Report any MTE async fault before going to suspend */ mte_suspend_enter(); @@ -103,12 +105,18 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) flags = local_daif_save(); /* - * Function graph tracer state gets incosistent when the kernel + * Function graph tracer state gets inconsistent when the kernel * calls functions that never return (aka suspend finishers) hence * disable graph tracing during their execution. */ pause_graph_tracing(); + /* + * Switch to using DAIF.IF instead of PMR in order to reliably + * resume if we're using pseudo-NMIs. + */ + arm_cpuidle_save_irq_context(&context); + if (__cpu_suspend_enter(&state)) { /* Call the suspend finisher */ ret = fn(arg); @@ -126,6 +134,8 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) RCU_NONIDLE(__cpu_suspend_exit()); } + arm_cpuidle_restore_irq_context(&context); + unpause_graph_tracing(); /* -- cgit v1.2.3 From 4e16f283edc289820e9b2d6f617ed8e514ee8396 Mon Sep 17 00:00:00 2001 From: Tuan Phan Date: Thu, 17 Jun 2021 09:08:49 -0700 Subject: perf/arm-cmn: Fix invalid pointer when access dtc object sharing the same IRQ number When multiple dtcs share the same IRQ number, the irq_friend which used to refer to dtc object gets calculated incorrect which leads to invalid pointer. Fixes: 0ba64770a2f2 ("perf: Add Arm CMN-600 PMU driver") Signed-off-by: Tuan Phan Reviewed-by: Robin Murphy Link: https://lore.kernel.org/r/1623946129-3290-1-git-send-email-tuanphan@os.amperecomputing.com Signed-off-by: Will Deacon --- drivers/perf/arm-cmn.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 4f46f654279d..bc3cba5f8c5d 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -1212,7 +1212,7 @@ static int arm_cmn_init_irqs(struct arm_cmn *cmn) irq = cmn->dtc[i].irq; for (j = i; j--; ) { if (cmn->dtc[j].irq == irq) { - cmn->dtc[j].irq_friend = j - i; + cmn->dtc[j].irq_friend = i - j; goto next; } } -- cgit v1.2.3 From d96b1b8c9f79b6bb234a31c80972a6f422079376 Mon Sep 17 00:00:00 2001 From: Jing Xiangfeng Date: Thu, 17 Jun 2021 20:26:14 +0800 Subject: drivers/perf: fix the missed ida_simple_remove() in ddr_perf_probe() ddr_perf_probe() misses to call ida_simple_remove() in an error path. Jump to cpuhp_state_err to fix it. 
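The fix follows the standard goto-unwind idiom: every failure after a resource has been acquired must branch to a label that releases it, rather than returning directly. A small stand-alone sketch of the idiom, using invented resources rather than the driver's actual calls:

#include <stdio.h>
#include <stdlib.h>

static int probe_sketch(void)
{
        int *id;
        char *name;
        int ret = 0;

        id = malloc(sizeof(*id));       /* stands in for ida_simple_get() */
        if (!id)
                return -1;

        name = malloc(16);              /* stands in for devm_kasprintf() */
        if (!name) {
                ret = -1;
                goto err_free_id;       /* returning here would leak 'id' */
        }

        free(name);
        free(id);
        return 0;

err_free_id:
        free(id);                       /* single unwind point for the id */
        return ret;
}

int main(void)
{
        return probe_sketch() ? EXIT_FAILURE : EXIT_SUCCESS;
}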
Signed-off-by: Jing Xiangfeng Reviewed-by: Dong Aisheng Link: https://lore.kernel.org/r/20210617122614.166823-1-jingxiangfeng@huawei.com Signed-off-by: Will Deacon --- drivers/perf/fsl_imx8_ddr_perf.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/perf/fsl_imx8_ddr_perf.c b/drivers/perf/fsl_imx8_ddr_perf.c index 2a1d78794a4e..94ebc1ecace7 100644 --- a/drivers/perf/fsl_imx8_ddr_perf.c +++ b/drivers/perf/fsl_imx8_ddr_perf.c @@ -702,8 +702,10 @@ static int ddr_perf_probe(struct platform_device *pdev) name = devm_kasprintf(&pdev->dev, GFP_KERNEL, DDR_PERF_DEV_NAME "%d", num); - if (!name) - return -ENOMEM; + if (!name) { + ret = -ENOMEM; + goto cpuhp_state_err; + } pmu->devtype_data = of_device_get_match_data(&pdev->dev); -- cgit v1.2.3 From cf814bcfa1e661d6d2fe74ed6da3d2aa558c894a Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Thu, 17 Jun 2021 08:30:59 +0100 Subject: arm64: smp: Bump debugging information print down to KERN_DEBUG This sort of information is only generally useful when debugging. No need to have these sprinkled through the kernel log otherwise. Cc: Will Deacon Cc: Catalin Marinas Cc: linux-arm-kernel@lists.infradead.org Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20210617073059.315542-1-lee.jones@linaro.org Signed-off-by: Will Deacon --- arch/arm64/kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index dcd7041b2b07..4d13b1d98e1c 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -352,7 +352,7 @@ void __cpu_die(unsigned int cpu) pr_crit("CPU%u: cpu didn't die\n", cpu); return; } - pr_notice("CPU%u: shutdown\n", cpu); + pr_debug("CPU%u: shutdown\n", cpu); /* * Now that the dying CPU is beyond the point of no return w.r.t. -- cgit v1.2.3 From 69bb0585ebb0c48c93fc55fc27afbfc06adef2fd Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 18 Jun 2021 16:11:22 +0100 Subject: arm64: insn: avoid circular include dependency Nathan reports that when building with CONFIG_LTO_CLANG_THIN=y, the build fails due to BUILD_BUG_ON() not being defined before its uss in . The problem is that with LTO, we patch READ_ONCE(), and includes , creating a circular include chain: ... and so when includes , none of the BUILD_BUG* definitions have happened yet. To avoid this, let's move AARCH64_INSN_SIZE into a header without any dependencies, such that it can always be safely included. At the same time, avoid including in , which should no longer be necessary (and doesn't make sense when insn.h is consumed by userspace). 
Reported-by: Nathan Chancellor Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Will Deacon Link: https://lore.kernel.org/r/20210621080830.GA37068@C02TD0UTHF1T.local Fixes: 3e00e39d9dad ("arm64: insn: move AARCH64_INSN_SIZE into ") Signed-off-by: Will Deacon --- arch/arm64/include/asm/alternative-macros.h | 2 +- arch/arm64/include/asm/insn-def.h | 9 +++++++++ arch/arm64/include/asm/insn.h | 5 +---- 3 files changed, 11 insertions(+), 5 deletions(-) create mode 100644 arch/arm64/include/asm/insn-def.h diff --git a/arch/arm64/include/asm/alternative-macros.h b/arch/arm64/include/asm/alternative-macros.h index 703fbf310b79..eba3173a2a2c 100644 --- a/arch/arm64/include/asm/alternative-macros.h +++ b/arch/arm64/include/asm/alternative-macros.h @@ -3,7 +3,7 @@ #define __ASM_ALTERNATIVE_MACROS_H #include -#include +#include #define ARM64_CB_PATCH ARM64_NCAPS diff --git a/arch/arm64/include/asm/insn-def.h b/arch/arm64/include/asm/insn-def.h new file mode 100644 index 000000000000..2c075f615c6a --- /dev/null +++ b/arch/arm64/include/asm/insn-def.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __ASM_INSN_DEF_H +#define __ASM_INSN_DEF_H + +/* A64 instructions are always 32 bits. */ +#define AARCH64_INSN_SIZE 4 + +#endif /* __ASM_INSN_DEF_H */ diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index 1430b4973039..6b776c8667b2 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -10,10 +10,7 @@ #include #include -#include - -/* A64 instructions are always 32 bits. */ -#define AARCH64_INSN_SIZE 4 +#include #ifndef __ASSEMBLY__ /* -- cgit v1.2.3 From 2062d44da3499eed3c7d005df8f0b54d300ac0b5 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 18 Jun 2021 10:17:02 +0530 Subject: arm64/mm: Rename ARM64_SWAPPER_USES_SECTION_MAPS ARM64_SWAPPER_USES_SECTION_MAPS implies that a PMD level huge page mappings are used for swapper, idmap and vmemmap. Lets make it PMD explicit removing any possible confusion with generic memory sections and also bit generic as it's applicable for idmap and vmemmap mappings as well. Hence rename it as ARM64_KERNEL_USES_PMD_MAPS instead. Cc: Catalin Marinas Cc: Will Deacon Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/1623991622-24294-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/kernel-pgtable.h | 10 +++++----- arch/arm64/mm/mmu.c | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index 1260187adb31..3512184cfec1 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -18,9 +18,9 @@ * 64K (section size = 512M). */ #ifdef CONFIG_ARM64_4K_PAGES -#define ARM64_SWAPPER_USES_SECTION_MAPS 1 +#define ARM64_KERNEL_USES_PMD_MAPS 1 #else -#define ARM64_SWAPPER_USES_SECTION_MAPS 0 +#define ARM64_KERNEL_USES_PMD_MAPS 0 #endif /* @@ -33,7 +33,7 @@ * VA range, so pages required to map highest possible PA are reserved in all * cases. 
*/ -#if ARM64_SWAPPER_USES_SECTION_MAPS +#if ARM64_KERNEL_USES_PMD_MAPS #define SWAPPER_PGTABLE_LEVELS (CONFIG_PGTABLE_LEVELS - 1) #define IDMAP_PGTABLE_LEVELS (ARM64_HW_PGTABLE_LEVELS(PHYS_MASK_SHIFT) - 1) #else @@ -90,7 +90,7 @@ #define IDMAP_DIR_SIZE (IDMAP_PGTABLE_LEVELS * PAGE_SIZE) /* Initial memory map size */ -#if ARM64_SWAPPER_USES_SECTION_MAPS +#if ARM64_KERNEL_USES_PMD_MAPS #define SWAPPER_BLOCK_SHIFT PMD_SHIFT #define SWAPPER_BLOCK_SIZE PMD_SIZE #define SWAPPER_TABLE_SHIFT PUD_SHIFT @@ -106,7 +106,7 @@ #define SWAPPER_PTE_FLAGS (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED) #define SWAPPER_PMD_FLAGS (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S) -#if ARM64_SWAPPER_USES_SECTION_MAPS +#if ARM64_KERNEL_USES_PMD_MAPS #define SWAPPER_MM_MMUFLAGS (PMD_ATTRINDX(MT_NORMAL) | SWAPPER_PMD_FLAGS) #else #define SWAPPER_MM_MMUFLAGS (PTE_ATTRINDX(MT_NORMAL) | SWAPPER_PTE_FLAGS) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 5b75f7eefb72..e04e4b6bdf16 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1113,14 +1113,14 @@ static void free_empty_tables(unsigned long addr, unsigned long end, } #endif -#if !ARM64_SWAPPER_USES_SECTION_MAPS +#if !ARM64_KERNEL_USES_PMD_MAPS int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap) { WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END)); return vmemmap_populate_basepages(start, end, node, altmap); } -#else /* !ARM64_SWAPPER_USES_SECTION_MAPS */ +#else /* !ARM64_KERNEL_USES_PMD_MAPS */ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap) { @@ -1165,7 +1165,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, return 0; } -#endif /* !ARM64_SWAPPER_USES_SECTION_MAPS */ +#endif /* !ARM64_KERNEL_USES_PMD_MAPS */ #ifdef CONFIG_MEMORY_HOTPLUG void vmemmap_free(unsigned long start, unsigned long end, -- cgit v1.2.3 From cf292e93f423fdebdf751a22ea01249196806328 Mon Sep 17 00:00:00 2001 From: Raphael Gault Date: Mon, 17 May 2021 13:02:56 -0500 Subject: arm64: Restrict undef hook for cpufeature registers This commit modifies the mask of the mrs_hook declared in arch/arm64/kernel/cpufeatures.c which emulates only feature register access. This is necessary because this hook's mask was too large and thus masking any mrs instruction, even if not related to the emulated registers which made the pmu emulation inefficient. Signed-off-by: Raphael Gault Signed-off-by: Rob Herring Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20210517180256.2881891-1-robh@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/cpufeature.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 52389018ff33..dbae006f625f 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -3018,8 +3018,8 @@ static int emulate_mrs(struct pt_regs *regs, u32 insn) } static struct undef_hook mrs_hook = { - .instr_mask = 0xfff00000, - .instr_val = 0xd5300000, + .instr_mask = 0xffff0000, + .instr_val = 0xd5380000, .pstate_mask = PSR_AA32_MODE_MASK, .pstate_val = PSR_MODE_EL0t, .fn = emulate_mrs, -- cgit v1.2.3 From 52218fcd61cb42bde0d301db4acb3ffdf3463cc7 Mon Sep 17 00:00:00 2001 From: Zhenyu Ye Date: Wed, 23 Jun 2021 15:05:22 +0800 Subject: arm64: tlb: fix the TTL value of tlb_get_level The TTL field indicates the level of page table walk holding the *leaf* entry for the address being invalidated. 
But currently, the TTL field may be set to an incorrect value in the following stack: pte_free_tlb __pte_free_tlb tlb_remove_table tlb_table_invalidate tlb_flush_mmu_tlbonly tlb_flush In this case, we just want to flush a PTE page, but tlb->cleared_pmds is set and we get tlb_level = 2 in the tlb_get_level() function. This may cause some unexpected problems. This patch sets the TTL field to 0 if tlb->freed_tables is set. tlb->freed_tables indicates that page table pages have been freed, not just leaf entries. Cc: # 5.9.x Fixes: c4ab2cbc1d87 ("arm64: tlb: Set the TTL field in flush_tlb_range") Acked-by: Catalin Marinas Reported-by: ZhuRui Signed-off-by: Zhenyu Ye Link: https://lore.kernel.org/r/b80ead47-1f88-3a00-18e1-cacc22f54cc4@huawei.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/tlb.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h index 61c97d3b58c7..c995d1f4594f 100644 --- a/arch/arm64/include/asm/tlb.h +++ b/arch/arm64/include/asm/tlb.h @@ -28,6 +28,10 @@ static void tlb_flush(struct mmu_gather *tlb); */ static inline int tlb_get_level(struct mmu_gather *tlb) { + /* The TTL field is only valid for the leaf entry. */ + if (tlb->freed_tables) + return 0; + if (tlb->cleared_ptes && !(tlb->cleared_pmds || tlb->cleared_puds || tlb->cleared_p4ds)) -- cgit v1.2.3