86 files changed, 4184 insertions, 1070 deletions
diff --git a/Documentation/admin-guide/perf/hisi-pcie-pmu.rst b/Documentation/admin-guide/perf/hisi-pcie-pmu.rst
new file mode 100644
index 000000000000..294ebbdb22af
--- /dev/null
+++ b/Documentation/admin-guide/perf/hisi-pcie-pmu.rst
@@ -0,0 +1,106 @@
+================================================
+HiSilicon PCIe Performance Monitoring Unit (PMU)
+================================================
+
+On Hip09, HiSilicon PCIe Performance Monitoring Unit (PMU) could monitor
+bandwidth, latency, bus utilization and buffer occupancy data of PCIe.
+
+Each PCIe Core has a PMU to monitor multi Root Ports of this PCIe Core and
+all Endpoints downstream these Root Ports.
+
+
+HiSilicon PCIe PMU driver
+=========================
+
+The PCIe PMU driver registers a perf PMU with the name of its sicl-id and PCIe
+Core id.::
+
+  /sys/bus/event_source/hisi_pcie<sicl>_<core>
+
+PMU driver provides description of available events and filter options in sysfs,
+see /sys/bus/event_source/devices/hisi_pcie<sicl>_<core>.
+
+The "format" directory describes all formats of the config (events) and config1
+(filter options) fields of the perf_event_attr structure. The "events" directory
+describes all documented events shown in perf list.
+
+The "identifier" sysfs file allows users to identify the version of the
+PMU hardware device.
+
+The "bus" sysfs file allows users to get the bus number of Root Ports
+monitored by PMU.
+
+Example usage of perf::
+
+  $# perf list
+  hisi_pcie0_0/rx_mwr_latency/ [kernel PMU event]
+  hisi_pcie0_0/rx_mwr_cnt/ [kernel PMU event]
+  ------------------------------------------
+
+  $# perf stat -e hisi_pcie0_0/rx_mwr_latency/
+  $# perf stat -e hisi_pcie0_0/rx_mwr_cnt/
+  $# perf stat -g -e hisi_pcie0_0/rx_mwr_latency/ -e hisi_pcie0_0/rx_mwr_cnt/
+
+The current driver does not support sampling. So "perf record" is unsupported.
+Also attach to a task is unsupported for PCIe PMU.
+
+Filter options
+--------------
+
+1. Target filter
+PMU could only monitor the performance of traffic downstream target Root Ports
+or downstream target Endpoint. PCIe PMU driver support "port" and "bdf"
+interfaces for users, and these two interfaces aren't supported at the same
+time.
+
+-port
+"port" filter can be used in all PCIe PMU events, target Root Port can be
+selected by configuring the 16-bits-bitmap "port". Multi ports can be selected
+for AP-layer-events, and only one port can be selected for TL/DL-layer-events.
+
+For example, if target Root Port is 0000:00:00.0 (x8 lanes), bit0 of bitmap
+should be set, port=0x1; if target Root Port is 0000:00:04.0 (x4 lanes),
+bit8 is set, port=0x100; if these two Root Ports are both monitored, port=0x101.
+
+Example usage of perf::
+
+  $# perf stat -e hisi_pcie0_0/rx_mwr_latency,port=0x1/ sleep 5
+
+-bdf
+
+"bdf" filter can only be used in bandwidth events, target Endpoint is selected
+by configuring BDF to "bdf". Counter only counts the bandwidth of message
+requested by target Endpoint.
+
+For example, "bdf=0x3900" means BDF of target Endpoint is 0000:39:00.0.
+
+Example usage of perf::
+
+  $# perf stat -e hisi_pcie0_0/rx_mrd_flux,bdf=0x3900/ sleep 5
+
+2. Trigger filter
+Event statistics start when the first time TLP length is greater/smaller
+than trigger condition. You can set the trigger condition by writing "trig_len",
+and set the trigger mode by writing "trig_mode". This filter can only be used
+in bandwidth events.
+
+For example, "trig_len=4" means trigger condition is 2^4 DW, "trig_mode=0"
+means statistics start when TLP length > trigger condition, "trig_mode=1"
+means start when TLP length < condition.
+
+Example usage of perf::
+
+  $# perf stat -e hisi_pcie0_0/rx_mrd_flux,trig_len=0x4,trig_mode=1/ sleep 5
+
+3. Threshold filter
+Counter counts when TLP length within the specified range. You can set the
+threshold by writing "thr_len", and set the threshold mode by writing
+"thr_mode". This filter can only be used in bandwidth events.
+
+For example, "thr_len=4" means threshold is 2^4 DW, "thr_mode=0" means
+counter counts when TLP length >= threshold, and "thr_mode=1" means counts
+when TLP length < threshold.
+
+Example usage of perf::
+
+  $# perf stat -e hisi_pcie0_0/rx_mrd_flux,thr_len=0x4,thr_mode=1/ sleep 5
diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index 0e486f41185e..d359bcfadd39 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -905,6 +905,17 @@ enabled, otherwise writing to this file will return ``-EBUSY``.
 The default value is 8.
 
 
+perf_user_access (arm64 only)
+=================================
+
+Controls user space access for reading perf event counters. When set to 1,
+user space can read performance monitor counter registers directly.
+
+The default value is 0 (access disabled).
+
+See Documentation/arm64/perf.rst for more information.
+
+
 pid_max
 =======
 
diff --git a/Documentation/arm64/cpu-feature-registers.rst b/Documentation/arm64/cpu-feature-registers.rst
index 9f9b8fd06089..749ae970c319 100644
--- a/Documentation/arm64/cpu-feature-registers.rst
+++ b/Documentation/arm64/cpu-feature-registers.rst
@@ -275,6 +275,23 @@ infrastructure:
      | SVEVer                       | [3-0]   |    y    |
      +------------------------------+---------+---------+
 
+  8) ID_AA64MMFR1_EL1 - Memory model feature register 1
+
+     +------------------------------+---------+---------+
+     | Name                         |  bits   | visible |
+     +------------------------------+---------+---------+
+     | AFP                          | [47-44] |    y    |
+     +------------------------------+---------+---------+
+
+  9) ID_AA64ISAR2_EL1 - Instruction set attribute register 2
+
+     +------------------------------+---------+---------+
+     | Name                         |  bits   | visible |
+     +------------------------------+---------+---------+
+     | RPRES                        | [7-4]   |    y    |
+     +------------------------------+---------+---------+
+
+
 Appendix I: Example
 -------------------
 
diff --git a/Documentation/arm64/elf_hwcaps.rst b/Documentation/arm64/elf_hwcaps.rst
index af106af8e1c0..b72ff17d600a 100644
--- a/Documentation/arm64/elf_hwcaps.rst
+++ b/Documentation/arm64/elf_hwcaps.rst
@@ -251,6 +251,14 @@ HWCAP2_ECV
 
     Functionality implied by ID_AA64MMFR0_EL1.ECV == 0b0001.
 
+HWCAP2_AFP
+
+    Functionality implied by ID_AA64MFR1_EL1.AFP == 0b0001.
+
+HWCAP2_RPRES
+
+    Functionality implied by ID_AA64ISAR2_EL1.RPRES == 0b0001.
+
 4. Unused AT_HWCAP bits
 -----------------------
 
diff --git a/Documentation/arm64/perf.rst b/Documentation/arm64/perf.rst
index b567f177d385..1f87b57c2332 100644
--- a/Documentation/arm64/perf.rst
+++ b/Documentation/arm64/perf.rst
@@ -2,7 +2,10 @@
 
 .. _perf_index:
 
-=====================
+====
+Perf
+====
+
 Perf Event Attributes
 =====================
 
@@ -88,3 +91,76 @@ exclude_host. However when using !exclude_hv there is a small blackout
 window at the guest entry/exit where host events are not captured.
 
 On VHE systems there are no blackout windows.
+
+Perf Userspace PMU Hardware Counter Access
+==========================================
+
+Overview
+--------
+The perf userspace tool relies on the PMU to monitor events. It offers an
+abstraction layer over the hardware counters since the underlying
+implementation is cpu-dependent.
+Arm64 allows userspace tools to have access to the registers storing the
+hardware counters' values directly.
+
+This targets specifically self-monitoring tasks in order to reduce the overhead
+by directly accessing the registers without having to go through the kernel.
+
+How-to
+------
+The focus is set on the armv8 PMUv3 which makes sure that the access to the pmu
+registers is enabled and that the userspace has access to the relevant
+information in order to use them.
+
+In order to have access to the hardware counters, the global sysctl
+kernel/perf_user_access must first be enabled:
+
+.. code-block:: sh
+
+  echo 1 > /proc/sys/kernel/perf_user_access
+
+It is necessary to open the event using the perf tool interface with config1:1
+attr bit set: the sys_perf_event_open syscall returns a fd which can
+subsequently be used with the mmap syscall in order to retrieve a page of memory
+containing information about the event. The PMU driver uses this page to expose
+to the user the hardware counter's index and other necessary data. Using this
+index enables the user to access the PMU registers using the `mrs` instruction.
+Access to the PMU registers is only valid while the sequence lock is unchanged.
+In particular, the PMSELR_EL0 register is zeroed each time the sequence lock is
+changed.
+
+The userspace access is supported in libperf using the perf_evsel__mmap()
+and perf_evsel__read() functions. See `tools/lib/perf/tests/test-evsel.c`_ for
+an example.
+
+About heterogeneous systems
+---------------------------
+On heterogeneous systems such as big.LITTLE, userspace PMU counter access can
+only be enabled when the tasks are pinned to a homogeneous subset of cores and
+the corresponding PMU instance is opened by specifying the 'type' attribute.
+The use of generic event types is not supported in this case.
+
+Have a look at `tools/perf/arch/arm64/tests/user-events.c`_ for an example. It
+can be run using the perf tool to check that the access to the registers works
+correctly from userspace:
+
+.. code-block:: sh
+
+  perf test -v user
+
+About chained events and counter sizes
+--------------------------------------
+The user can request either a 32-bit (config1:0 == 0) or 64-bit (config1:0 == 1)
+counter along with userspace access. The sys_perf_event_open syscall will fail
+if a 64-bit counter is requested and the hardware doesn't support 64-bit
+counters. Chained events are not supported in conjunction with userspace counter
+access. If a 32-bit counter is requested on hardware with 64-bit counters, then
+userspace must treat the upper 32-bits read from the counter as UNKNOWN. The
+'pmc_width' field in the user page will indicate the valid width of the counter
+and should be used to mask the upper bits as needed.
+
+.. Links
+.. _tools/perf/arch/arm64/tests/user-events.c:
+   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/perf/arch/arm64/tests/user-events.c
+.. _tools/lib/perf/tests/test-evsel.c:
+   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/lib/perf/tests/test-evsel.c
diff --git a/Documentation/arm64/sve.rst b/Documentation/arm64/sve.rst
index 03137154299e..9d9a4de5bc34 100644
--- a/Documentation/arm64/sve.rst
+++ b/Documentation/arm64/sve.rst
@@ -255,7 +255,7 @@ prctl(PR_SVE_GET_VL)
     vector length change (which would only normally be the case between a
     fork() or vfork() and the corresponding execve() in typical use).
 
-    To extract the vector length from the result, and it with
+    To extract the vector length from the result, bitwise and it with
     PR_SVE_VL_LEN_MASK.
 
     Return value: a nonnegative value on success, or a negative value on error:
diff --git a/Documentation/arm64/tagged-address-abi.rst b/Documentation/arm64/tagged-address-abi.rst
index 0c9120ec58ae..540a1d4fc6c9 100644
--- a/Documentation/arm64/tagged-address-abi.rst
+++ b/Documentation/arm64/tagged-address-abi.rst
@@ -49,7 +49,7 @@ how the user addresses are used by the kernel:
 
    - ``brk()``, ``mmap()`` and the ``new_address`` argument to
      ``mremap()`` as these have the potential to alias with existing
-      user addresses.
+     user addresses.
 
      NOTE: This behaviour changed in v5.6 and so some earlier kernels may
      incorrectly accept valid tagged pointers for the ``brk()``,
diff --git a/Documentation/devicetree/bindings/perf/arm,cmn.yaml b/Documentation/devicetree/bindings/perf/arm,cmn.yaml
index 42424ccbdd0c..2d4219ec7eda 100644
--- a/Documentation/devicetree/bindings/perf/arm,cmn.yaml
+++ b/Documentation/devicetree/bindings/perf/arm,cmn.yaml
@@ -12,12 +12,14 @@ maintainers:
 
 properties:
   compatible:
-    const: arm,cmn-600
+    enum:
+      - arm,cmn-600
+      - arm,ci-700
 
   reg:
     items:
       - description: Physical address of the base (PERIPHBASE) and
-          size (up to 64MB) of the configuration address space.
+          size of the configuration address space.
 
   interrupts:
     minItems: 1
@@ -31,14 +33,23 @@ properties:
 
   arm,root-node:
     $ref: /schemas/types.yaml#/definitions/uint32
-    description: Offset from PERIPHBASE of the configuration
-      discovery node (see TRM definition of ROOTNODEBASE).
+    description: Offset from PERIPHBASE of CMN-600's configuration
+      discovery node (see TRM definition of ROOTNODEBASE). Not
+      relevant for newer CMN/CI products.
 
 required:
   - compatible
   - reg
   - interrupts
-  - arm,root-node
+
+if:
+  properties:
+    compatible:
+      contains:
+        const: arm,cmn-600
+then:
+  required:
+    - arm,root-node
 
 additionalProperties: false
 
diff --git a/Documentation/devicetree/bindings/perf/arm,smmu-v3-pmcg.yaml b/Documentation/devicetree/bindings/perf/arm,smmu-v3-pmcg.yaml
new file mode 100644
index 000000000000..a4b53a6a1ebf
--- /dev/null
+++ b/Documentation/devicetree/bindings/perf/arm,smmu-v3-pmcg.yaml
@@ -0,0 +1,70 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/perf/arm,smmu-v3-pmcg.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Arm SMMUv3 Performance Monitor Counter Group
+
+maintainers:
+  - Will Deacon <will@kernel.org>
+  - Robin Murphy <robin.murphy@arm.com>
+
+description: |
+  An SMMUv3 may have several Performance Monitor Counter Group (PMCG).
+  They are standalone performance monitoring units that support both
+  architected and IMPLEMENTATION DEFINED event counters.
+
+properties:
+  $nodename:
+    pattern: "^pmu@[0-9a-f]*"
+  compatible:
+    oneOf:
+      - items:
+          - const: arm,mmu-600-pmcg
+          - const: arm,smmu-v3-pmcg
+      - const: arm,smmu-v3-pmcg
+
+  reg:
+    items:
+      - description: Register page 0
+      - description: Register page 1, if SMMU_PMCG_CFGR.RELOC_CTRS = 1
+    minItems: 1
+
+  interrupts:
+    maxItems: 1
+
+  msi-parent: true
+
+required:
+  - compatible
+  - reg
+
+anyOf:
+  - required:
+      - interrupts
+  - required:
+      - msi-parent
+
+additionalProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+    #include <dt-bindings/interrupt-controller/irq.h>
+
+    pmu@2b420000 {
+            compatible = "arm,smmu-v3-pmcg";
+            reg = <0x2b420000 0x1000>,
+                  <0x2b430000 0x1000>;
+            interrupts = <GIC_SPI 80 IRQ_TYPE_EDGE_RISING>;
+            msi-parent = <&its 0xff0000>;
+    };
+
+    pmu@2b440000 {
+            compatible = "arm,smmu-v3-pmcg";
+            reg = <0x2b440000 0x1000>,
+                  <0x2b450000 0x1000>;
+            interrupts = <GIC_SPI 81 IRQ_TYPE_EDGE_RISING>;
+            msi-parent = <&its 0xff0000>;
+    };
diff --git a/Documentation/devicetree/bindings/perf/marvell-cn10k-tad.yaml b/Documentation/devicetree/bindings/perf/marvell-cn10k-tad.yaml
new file mode 100644
index 000000000000..362142252667
--- /dev/null
+++ b/Documentation/devicetree/bindings/perf/marvell-cn10k-tad.yaml
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/perf/marvell-cn10k-tad.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Marvell CN10K LLC-TAD performance monitor
+
+maintainers:
+  - Bhaskara Budiredla <bbudiredla@marvell.com>
+
+description: |
+  The Tag-and-Data units (TADs) maintain coherence and contain CN10K
+  shared on-chip last level cache (LLC). The tad pmu measures the
+  performance of last-level cache. Each tad pmu supports up to eight
+  counters.
+
+  The DT setup comprises of number of tad blocks, the sizes of pmu
+  regions, tad blocks and overall base address of the HW.
+
+properties:
+  compatible:
+    const: marvell,cn10k-tad-pmu
+
+  reg:
+    maxItems: 1
+
+  marvell,tad-cnt:
+    description: specifies the number of tads on the soc
+    $ref: /schemas/types.yaml#/definitions/uint32
+
+  marvell,tad-page-size:
+    description: specifies the size of each tad page
+    $ref: /schemas/types.yaml#/definitions/uint32
+
+  marvell,tad-pmu-page-size:
+    description: specifies the size of page that the pmu uses
+    $ref: /schemas/types.yaml#/definitions/uint32
+
+required:
+  - compatible
+  - reg
+  - marvell,tad-cnt
+  - marvell,tad-page-size
+  - marvell,tad-pmu-page-size
+
+additionalProperties: false
+
+examples:
+  - |
+
+    tad {
+        #address-cells = <2>;
+        #size-cells = <2>;
+
+        tad_pmu@80000000 {
+            compatible = "marvell,cn10k-tad-pmu";
+            reg = <0x87e2 0x80000000 0x0 0x1000>;
+            marvell,tad-cnt = <1>;
+            marvell,tad-page-size = <0x1000>;
+            marvell,tad-pmu-page-size = <0x1000>;
+        };
+    };
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index 7367ada13208..b12df9137e1c 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -1950,6 +1950,14 @@ There are some more advanced barrier functions:
      For load from persistent memory, existing read memory barriers are sufficient
      to ensure read ordering.
 
+ (*) io_stop_wc();
+
+     For memory accesses with write-combining attributes (e.g. those returned
+     by ioremap_wc(), the CPU may wait for prior accesses to be merged with
+     subsequent ones. io_stop_wc() can be used to prevent the merging of
+     write-combining memory accesses before this macro with those after it when
+     such wait has performance implications.
+
 ===============================
 IMPLICIT KERNEL MEMORY BARRIERS
 ===============================
diff --git a/MAINTAINERS b/MAINTAINERS
index 59fceb6a6385..1f7658bab9e7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8615,8 +8615,10 @@ F:	drivers/misc/hisi_hikey_usb.c
 
 HISILICON PMU DRIVER
 M:	Shaokun Zhang <zhangshaokun@hisilicon.com>
+M:	Qi Liu <liuqi115@huawei.com>
 S:	Supported
 W:	http://www.hisilicon.com
+F:	Documentation/admin-guide/perf/hisi-pcie-pmu.rst
 F:	Documentation/admin-guide/perf/hisi-pmu.rst
 F:	drivers/perf/hisilicon
 
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index c4207cf9bb17..ef3b5cb40d16 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -150,6 +150,8 @@ config ARM64
 	select HAVE_ARCH_KASAN_VMALLOC if HAVE_ARCH_KASAN
 	select HAVE_ARCH_KASAN_SW_TAGS if HAVE_ARCH_KASAN
 	select HAVE_ARCH_KASAN_HW_TAGS if (HAVE_ARCH_KASAN && ARM64_MTE)
+	# Some instrumentation may be unsound, hence EXPERT
+	select HAVE_ARCH_KCSAN if EXPERT
 	select HAVE_ARCH_KFENCE
 	select HAVE_ARCH_KGDB
 	select HAVE_ARCH_MMAP_RND_BITS
@@ -1545,6 +1547,12 @@ endmenu
 
 menu "ARMv8.2 architectural features"
 
+config AS_HAS_ARMV8_2
+       def_bool $(cc-option,-Wa$(comma)-march=armv8.2-a)
+
+config AS_HAS_SHA3
+       def_bool $(as-instr,.arch armv8.2-a+sha3)
+
 config ARM64_PMEM
 	bool "Enable support for persistent memory"
 	select ARCH_HAS_PMEM_API
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
index e8cfc5868aa8..2f1de88651e6 100644
--- a/arch/arm64/Makefile
+++ b/arch/arm64/Makefile
@@ -58,6 +58,11 @@ stack_protector_prepare: prepare0
 					include/generated/asm-offsets.h))
 endif
 
+ifeq ($(CONFIG_AS_HAS_ARMV8_2), y)
+# make sure to pass the newest target architecture to -march.
+asm-arch := armv8.2-a
+endif
+
 # Ensure that if the compiler supports branch protection we default it
 # off, this will be overridden if we are using branch protection.
 branch-prot-flags-y += $(call cc-option,-mbranch-protection=none)
diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S
index b495de22bb38..ff01f0167ba2 100644
--- a/arch/arm64/crypto/aes-modes.S
+++ b/arch/arm64/crypto/aes-modes.S
@@ -363,15 +363,15 @@ ST5(	mov		v4.16b, vctr.16b		)
 	adr		x16, 1f
 	sub		x16, x16, x12, lsl #3
 	br		x16
-	hint		34			// bti c
+	bti		c
 	mov		v0.d[0], vctr.d[0]
-	hint		34			// bti c
+	bti		c
 	mov		v1.d[0], vctr.d[0]
-	hint		34			// bti c
+	bti		c
 	mov		v2.d[0], vctr.d[0]
-	hint		34			// bti c
+	bti		c
 	mov		v3.d[0], vctr.d[0]
-ST5(	hint		34				)
+ST5(	bti		c				)
 ST5(	mov		v4.d[0], vctr.d[0]		)
 1:	b		2f
 	.previous
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index 136d13f3d6e9..e8bd0af0141c 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -791,6 +791,16 @@ alternative_endif
 	.endm
 
 /*
+ * Branch Target Identifier (BTI)
+ */
+	.macro  bti, targets
+	.equ	.L__bti_targets_c, 34
+	.equ	.L__bti_targets_j, 36
+	.equ	.L__bti_targets_jc,38
+	hint	#.L__bti_targets_\targets
+	.endm
+
+/*
  * This macro emits a program property note section identifying
  * architecture features which require special handling, mainly for
  * use in assembly files included in the VDSO.
diff --git a/arch/arm64/include/asm/atomic_ll_sc.h b/arch/arm64/include/asm/atomic_ll_sc.h
index 13869b76b58c..fe0db8d416fb 100644
--- a/arch/arm64/include/asm/atomic_ll_sc.h
+++ b/arch/arm64/include/asm/atomic_ll_sc.h
@@ -44,11 +44,11 @@ __ll_sc_atomic_##op(int i, atomic_t *v)					\
 									\
 	asm volatile("// atomic_" #op "\n"				\
 	__LL_SC_FALLBACK(						\
-"	prfm	pstl1strm, %2\n"					\
-"1:	ldxr	%w0, %2\n"						\
-"	" #asm_op "	%w0, %w0, %w3\n"				\
-"	stxr	%w1, %w0, %2\n"						\
-"	cbnz	%w1, 1b\n")						\
+	"	prfm	pstl1strm, %2\n"				\
+	"1:	ldxr	%w0, %2\n"					\
+	"	" #asm_op "	%w0, %w0, %w3\n"			\
+	"	stxr	%w1, %w0, %2\n"					\
+	"	cbnz	%w1, 1b\n")					\
 	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)		\
 	: __stringify(constraint) "r" (i));				\
 }
@@ -62,12 +62,12 @@ __ll_sc_atomic_##op##_return##name(int i, atomic_t *v)			\
 									\
 	asm volatile("// atomic_" #op "_return" #name "\n"		\
 	__LL_SC_FALLBACK(						\
-"	prfm	pstl1strm, %2\n"					\
-"1:	ld" #acq "xr	%w0, %2\n"					\
-"	" #asm_op "	%w0, %w0, %w3\n"				\
-"	st" #rel "xr	%w1, %w0, %2\n"					\
-"	cbnz	%w1, 1b\n"						\
-"	" #mb )								\
+	"	prfm	pstl1strm, %2\n"				\
+	"1:	ld" #acq "xr	%w0, %2\n"				\
+	"	" #asm_op "	%w0, %w0, %w3\n"			\
+	"	st" #rel "xr	%w1, %w0, %2\n"				\
+	"	cbnz	%w1, 1b\n"					\
+	"	" #mb )							\
 	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)		\
 	: __stringify(constraint) "r" (i)				\
 	: cl);								\
@@ -84,12 +84,12 @@ __ll_sc_atomic_fetch_##op##name(int i, atomic_t *v)			\
 									\
 	asm volatile("// atomic_fetch_" #op #name "\n"			\
 	__LL_SC_FALLBACK(						\
-"	prfm	pstl1strm, %3\n"					\
-"1:	ld" #acq "xr	%w0, %3\n"					\
-"	" #asm_op "	%w1, %w0, %w4\n"				\
-"	st" #rel "xr	%w2, %w1, %3\n"					\
-"	cbnz	%w2, 1b\n"						\
-"	" #mb )								\
+	"	prfm	pstl1strm, %3\n"				\
+	"1:	ld" #acq "xr	%w0, %3\n"				\
+	"	" #asm_op "	%w1, %w0, %w4\n"			\
+	"	st" #rel "xr	%w2, %w1, %3\n"				\
+	"	cbnz	%w2, 1b\n"					\
+	"	" #mb )							\
 	: "=&r" (result), "=&r" (val), "=&r" (tmp), "+Q" (v->counter)	\
 	: __stringify(constraint) "r" (i)				\
 	: cl);								\
@@ -143,11 +143,11 @@ __ll_sc_atomic64_##op(s64 i, atomic64_t *v)				\
 									\
 	asm volatile("// atomic64_" #op "\n"				\
 	__LL_SC_FALLBACK(						\
-"	prfm	pstl1strm, %2\n"					\
-"1:	ldxr	%0, %2\n"						\
-"	" #asm_op "	%0, %0, %3\n"					\
-"	stxr	%w1, %0, %2\n"						\
-"	cbnz	%w1, 1b")						\
+	"	prfm	pstl1strm, %2\n"				\
+	"1:	ldxr	%0, %2\n"					\
+	"	" #asm_op "	%0, %0, %3\n"				\
+	"	stxr	%w1, %0, %2\n"					\
+	"	cbnz	%w1, 1b")					\
 	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)		\
 	: __stringify(constraint) "r" (i));				\
 }
@@ -161,12 +161,12 @@ __ll_sc_atomic64_##op##_return##name(s64 i, atomic64_t *v)		\
 									\
 	asm volatile("// atomic64_" #op "_return" #name "\n"		\
 	__LL_SC_FALLBACK(						\
-"	prfm	pstl1strm, %2\n"					\
-"1:	ld" #acq "xr	%0, %2\n"					\
-"	" #asm_op "	%0, %0, %3\n"					\
-"	st" #rel "xr	%w1, %0, %2\n"					\
-"	cbnz	%w1, 1b\n"						\
-"	" #mb )								\
+	"	prfm	pstl1strm, %2\n"				\
+	"1:	ld" #acq "xr	%0, %2\n"				\
+	"	" #asm_op "	%0, %0, %3\n"				\
+	"	st" #rel "xr	%w1, %0, %2\n"				\
+	"	cbnz	%w1, 1b\n"					\
+	"	" #mb )							\
 	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)		\
 	: __stringify(constraint) "r" (i)				\
 	: cl);								\
@@ -176,19 +176,19 @@ __ll_sc_atomic64_##op##_return##name(s64 i, atomic64_t *v)		\
 
 #define ATOMIC64_FETCH_OP(name, mb, acq, rel, cl, op, asm_op, constraint)\
 static inline long							\
-__ll_sc_atomic64_fetch_##op##name(s64 i, atomic64_t *v)		\
+__ll_sc_atomic64_fetch_##op##name(s64 i, atomic64_t *v)			\
 {									\
 	s64 result, val;						\
 	unsigned long tmp;						\
 									\
 	asm volatile("// atomic64_fetch_" #op #name "\n"		\
 	__LL_SC_FALLBACK(						\
-"	prfm	pstl1strm, %3\n"					\
-"1:	ld" #acq "xr	%0, %3\n"					\
-"	" #asm_op "	%1, %0, %4\n"					\
-"	st" #rel "xr	%w2, %1, %3\n"					\
-"	cbnz	%w2, 1b\n"						\
-"	" #mb )								\
+	"	prfm	pstl1strm, %3\n"				\
+	"1:	ld" #acq "xr	%0, %3\n"				\
+	"	" #asm_op "	%1, %0, %4\n"				\
+	"	st" #rel "xr	%w2, %1, %3\n"				\
+	"	cbnz	%w2, 1b\n"					\
+	"	" #mb )							\
 	: "=&r" (result), "=&r" (val), "=&r" (tmp), "+Q" (v->counter)	\
 	: __stringify(constraint) "r" (i)				\
 	: cl);								\
@@ -241,14 +241,14 @@ __ll_sc_atomic64_dec_if_positive(atomic64_t *v)
 
 	asm volatile("// atomic64_dec_if_positive\n"
 	__LL_SC_FALLBACK(
-"	prfm	pstl1strm, %2\n"
-"1:	ldxr	%0, %2\n"
-"	subs	%0, %0, #1\n"
-"	b.lt	2f\n"
-"	stlxr	%w1, %0, %2\n"
-"	cbnz	%w1, 1b\n"
-"	dmb	ish\n"
-"2:")
+	"	prfm	pstl1strm, %2\n"
+	"1:	ldxr	%0, %2\n"
+	"	subs	%0, %0, #1\n"
+	"	b.lt	2f\n"
+	"	stlxr	%w1, %0, %2\n"
+	"	cbnz	%w1, 1b\n"
+	"	dmb	ish\n"
+	"2:")
 	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
 	:
 	: "cc", "memory");
diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h
index da3280f639cd..d955ade5df7c 100644
--- a/arch/arm64/include/asm/atomic_lse.h
+++ b/arch/arm64/include/asm/atomic_lse.h
@@ -11,13 +11,13 @@
 #define __ASM_ATOMIC_LSE_H
 
 #define ATOMIC_OP(op, asm_op)						\
-static inline void __lse_atomic_##op(int i, atomic_t *v)			\
+static inline void __lse_atomic_##op(int i, atomic_t *v)		\
 {									\
 	asm volatile(							\
 	__LSE_PREAMBLE							\
-"	" #asm_op "	%w[i], %[v]\n"					\
-	: [i] "+r" (i), [v] "+Q" (v->counter)				\
-	: "r" (v));							\
+	"	" #asm_op "	%w[i], %[v]\n"				\
+	: [v] "+Q" (v->counter)						\
+	: [i] "r" (i));							\
 }
 
 ATOMIC_OP(andnot, stclr)
@@ -25,19 +25,27 @@ ATOMIC_OP(or, stset)
 ATOMIC_OP(xor, steor)
 ATOMIC_OP(add, stadd)
 
+static inline void __lse_atomic_sub(int i, atomic_t *v)
+{
+	__lse_atomic_add(-i, v);
+}
+
 #undef ATOMIC_OP
 
 #define ATOMIC_FETCH_OP(name, mb, op, asm_op, cl...)			\
 static inline int __lse_atomic_fetch_##op##name(int i, atomic_t *v)	\
 {									\
+	int old;							\
+									\
 	asm volatile(							\
 	__LSE_PREAMBLE							\
-"	" #asm_op #mb "	%w[i], %w[i], %[v]"				\
-	: [i] "+r" (i), [v] "+Q" (v->counter)				\
-	: "r" (v)							\
+	"	" #asm_op #mb "	%w[i], %w[old], %[v]"			\
+	: [v] "+Q" (v->counter),					\
+	  [old] "=r" (old)						\
+	: [i] "r" (i)							\
 	: cl);								\
 									\
-	return i;							\
+	return old;							\
 }
 
 #define ATOMIC_FETCH_OPS(op, asm_op)					\
@@ -54,51 +62,46 @@ ATOMIC_FETCH_OPS(add, ldadd)
 #undef ATOMIC_FETCH_OP
 #undef ATOMIC_FETCH_OPS
 
-#define ATOMIC_OP_ADD_RETURN(name, mb, cl...)				\
+#define ATOMIC_FETCH_OP_SUB(name)					\
+static inline int __lse_atomic_fetch_sub##name(int i, atomic_t *v)	\
+{									\
+	return __lse_atomic_fetch_add##name(-i, v);			\
+}
+
+ATOMIC_FETCH_OP_SUB(_relaxed)
+ATOMIC_FETCH_OP_SUB(_acquire)
+ATOMIC_FETCH_OP_SUB(_release)
+ATOMIC_FETCH_OP_SUB(        )
+
+#undef ATOMIC_FETCH_OP_SUB
+
+#define ATOMIC_OP_ADD_SUB_RETURN(name)					\
 static inline int __lse_atomic_add_return##name(int i, atomic_t *v)	\
 {									\
-	u32 tmp;							\
-									\
-	asm volatile(							\
-	__LSE_PREAMBLE							\
-	"	ldadd" #mb "	%w[i], %w[tmp], %[v]\n"			\
-	"	add	%w[i], %w[i], %w[tmp]"				\
-	: [i] "+r" (i), [v] "+Q" (v->counter), [tmp] "=&r" (tmp)	\
-	: "r" (v)							\
-	: cl);								\
+	return __lse_atomic_fetch_add##name(i, v) + i;			\
+}									\
 									\
-	return i;							\
+static inline int __lse_atomic_sub_return##name(int i, atomic_t *v)	\
+{									\
+	return __lse_atomic_fetch_sub(i, v) - i;			\
 }
 
-ATOMIC_OP_ADD_RETURN(_relaxed,   )
-ATOMIC_OP_ADD_RETURN(_acquire,  a, "memory")
-ATOMIC_OP_ADD_RETURN(_release,  l, "memory")
-ATOMIC_OP_ADD_RETURN(        , al, "memory")
+ATOMIC_OP_ADD_SUB_RETURN(_relaxed)
+ATOMIC_OP_ADD_SUB_RETURN(_acquire)
+ATOMIC_OP_ADD_SUB_RETURN(_release)
+ATOMIC_OP_ADD_SUB_RETURN(        )
 
-#undef ATOMIC_OP_ADD_RETURN
+#undef ATOMIC_OP_ADD_SUB_RETURN
 
 static inline void __lse_atomic_and(int i, atomic_t *v)
 {
-	asm volatile(
-	__LSE_PREAMBLE
-	"	mvn	%w[i], %w[i]\n"
-	"	stclr	%w[i], %[v]"
-	: [i] "+&r" (i), [v] "+Q" (v->counter)
-	: "r" (v));
+	return __lse_atomic_andnot(~i, v);
 }
 
 #define ATOMIC_FETCH_OP_AND(name, mb, cl...)				\
 static inline int __lse_atomic_fetch_and##name(int i, atomic_t *v)	\
 {									\
-	asm volatile(							\
-	__LSE_PREAMBLE							\
-	"	mvn	%w[i], %w[i]\n"					\
-	"	ldclr" #mb "	%w[i], %w[i], %[v]"			\
-	: [i] "+&r" (i), [v] "+Q" (v->counter)				\
-	: "r" (v)							\
-	: cl);								\
-									\
-	return i;							\
+	return __lse_atomic_fetch_andnot##name(~i, v);			\
 }
 
 ATOMIC_FETCH_OP_AND(_relaxed,   )
@@ -108,69 +111,14 @@ ATOMIC_FETCH_OP_AND(        , al, "memory")
 
 #undef ATOMIC_FETCH_OP_AND
 
-static inline void __lse_atomic_sub(int i, atomic_t *v)
-{
-	asm volatile(
-	__LSE_PREAMBLE
-	"	neg	%w[i], %w[i]\n"
-	"	stadd	%w[i], %[v]"
-	: [i] "+&r" (i), [v] "+Q" (v->counter)
-	: "r" (v));
-}
-
-#define ATOMIC_OP_SUB_RETURN(name, mb, cl...)				\
-static inline int __lse_atomic_sub_return##name(int i, atomic_t *v)	\
-{									\
-	u32 tmp;							\
-									\
-	asm volatile(							\
-	__LSE_PREAMBLE							\
-	"	neg	%w[i], %w[i]\n"					\
-	"	ldadd" #mb "	%w[i], %w[tmp], %[v]\n"			\
-	"	add	%w[i], %w[i], %w[tmp]"				\
-	: [i] "+&r" (i), [v] "+Q" (v->counter), [tmp] "=&r" (tmp)	\
-	: "r" (v)							\
-	: cl);							\
-									\
-	return i;							\
-}
-
-ATOMIC_OP_SUB_RETURN(_relaxed,   )
-ATOMIC_OP_SUB_RETURN(_acquire,  a, "memory")
-ATOMIC_OP_SUB_RETURN(_release,  l, "memory")
-ATOMIC_OP_SUB_RETURN(        , al, "memory")
-
-#undef ATOMIC_OP_SUB_RETURN
-
-#define ATOMIC_FETCH_OP_SUB(name, mb, cl...)				\
-static inline int __lse_atomic_fetch_sub##name(int i, atomic_t *v)	\
-{									\
-	asm volatile(							\
-	__LSE_PREAMBLE							\
-	"	neg	%w[i], %w[i]\n"					\
-	"	ldadd" #mb "	%w[i], %w[i], %[v]"			\
-	: [i] "+&r" (i), [v] "+Q" (v->counter)				\
-	: "r" (v)							\
-	: cl);								\
-									\
-	return i;							\
-}
-
-ATOMIC_FETCH_OP_SUB(_relaxed,   )
-ATOMIC_FETCH_OP_SUB(_acquire,  a, "memory")
-ATOMIC_FETCH_OP_SUB(_release,  l, "memory")
-ATOMIC_FETCH_OP_SUB(        , al, "memory")
-
-#undef ATOMIC_FETCH_OP_SUB
-
 #define ATOMIC64_OP(op, asm_op)						\
 static inline void __lse_atomic64_##op(s64 i, atomic64_t *v)		\
 {									\
 	asm volatile(							\
 	__LSE_PREAMBLE							\
-"	" #asm_op "	%[i], %[v]\n"					\
-	: [i] "+r" (i), [v] "+Q" (v->counter)				\
-	: "r" (v));							\
+	"	" #asm_op "	%[i], %[v]\n"				\
+	: [v] "+Q" (v->counter)						\
+	: [i] "r" (i));							\
 }
 
 ATOMIC64_OP(andnot, stclr)
@@ -178,19 +126,27 @@ ATOMIC64_OP(or, stset)
 ATOMIC64_OP(xor, steor)
 ATOMIC64_OP(add, stadd)
 
+static inline void __lse_atomic64_sub(s64 i, atomic64_t *v)
+{
+	__lse_atomic64_add(-i, v);
+}
+
 #undef ATOMIC64_OP
 
 #define ATOMIC64_FETCH_OP(name, mb, op, asm_op, cl...)			\
 static inline long __lse_atomic64_fetch_##op##name(s64 i, atomic64_t *v)\
 {									\
+	s64 old;							\
+									\
 	asm volatile(							\
 	__LSE_PREAMBLE							\
-"	" #asm_op #mb "	%[i], %[i], %[v]"				\
-	: [i] "+r" (i), [v] "+Q" (v->counter)				\
-	: "r" (v)							\
+	"	" #asm_op #mb "	%[i], %[old], %[v]"			\
+	: [v] "+Q" (v->counter),					\
+	  [old] "=r" (old)						\
+	: [i] "r" (i) 							\
 	: cl);								\
 									\
-	return i;							\
+	return old;							\
 }
 
 #define ATOMIC64_FETCH_OPS(op, asm_op)					\
@@ -207,51 +163,46 @@ ATOMIC64_FETCH_OPS(add, ldadd)
 #undef ATOMIC64_FETCH_OP
 #undef ATOMIC64_FETCH_OPS
 
-#define ATOMIC64_OP_ADD_RETURN(name, mb, cl...)				\
+#define ATOMIC64_FETCH_OP_SUB(name)					\
+static inline long __lse_atomic64_fetch_sub##name(s64 i, atomic64_t *v)	\
+{									\
+	return __lse_atomic64_fetch_add##name(-i, v);			\
+}
+
+ATOMIC64_FETCH_OP_SUB(_relaxed)
+ATOMIC64_FETCH_OP_SUB(_acquire)
+ATOMIC64_FETCH_OP_SUB(_release)
+ATOMIC64_FETCH_OP_SUB(        )
+
+#undef ATOMIC64_FETCH_OP_SUB
+
+#define ATOMIC64_OP_ADD_SUB_RETURN(name)				\
 static inline long __lse_atomic64_add_return##name(s64 i, atomic64_t *v)\
 {									\
-	unsigned long tmp;						\
-									\
-	asm volatile(							\
-	__LSE_PREAMBLE							\
-	"	ldadd" #mb "	%[i], %x[tmp], %[v]\n"			\
-	"	add	%[i], %[i], %x[tmp]"				\
-	: [i] "+r" (i), [v] "+Q" (v->counter), [tmp] "=&r" (tmp)	\
-	: "r" (v)							\
-	: cl);								\
+	return __lse_atomic64_fetch_add##name(i, v) + i;		\
+}									\
 									\
-	return i;							\
+static inline long __lse_atomic64_sub_return##name(s64 i, atomic64_t *v)\
+{									\
+	return __lse_atomic64_fetch_sub##name(i, v) - i;		\
 }
 
-ATOMIC64_OP_ADD_RETURN(_relaxed,   )
-ATOMIC64_OP_ADD_RETURN(_acquire,  a, "memory")
-ATOMIC64_OP_ADD_RETURN(_release,  l, "memory")
-ATOMIC64_OP_ADD_RETURN(        , al, "memory")
+ATOMIC64_OP_ADD_SUB_RETURN(_relaxed)
+ATOMIC64_OP_ADD_SUB_RETURN(_acquire)
+ATOMIC64_OP_ADD_SUB_RETURN(_release)
+ATOMIC64_OP_ADD_SUB_RETURN(        )
 
-#undef ATOMIC64_OP_ADD_RETURN
+#undef ATOMIC64_OP_ADD_SUB_RETURN
 
 static inline void __lse_atomic64_and(s64 i, atomic64_t *v)
 {
-	asm volatile(
-	__LSE_PREAMBLE
-	"	mvn	%[i], %[i]\n"
-	"	stclr	%[i], %[v]"
-	: [i] "+&r" (i), [v] "+Q" (v->counter)
-	: "r" (v));
+	return __lse_atomic64_andnot(~i, v);
 }
 
 #define ATOMIC64_FETCH_OP_AND(name, mb, cl...)				\
 static inline long __lse_atomic64_fetch_and##name(s64 i, atomic64_t *v)	\
 {									\
-	asm volatile(							\
-	__LSE_PREAMBLE							\
-	"	mvn	%[i], %[i]\n"					\
-	"	ldclr" #mb "	%[i], %[i], %[v]"			\
-	: [i] "+&r" (i), [v] "+Q" (v->counter)				\
-	: "r" (v)							\
-	: cl);								\
-									\
-	return i;							\
+	return __lse_atomic64_fetch_andnot##name(~i, v);		\
 }
 
 ATOMIC64_FETCH_OP_AND(_relaxed,   )
@@ -261,61 +212,6 @@ ATOMIC64_FETCH_OP_AND(        , al, "memory")
 
 #undef ATOMIC64_FETCH_OP_AND
 
-static inline void __lse_atomic64_sub(s64 i, atomic64_t *v)
-{
-	asm volatile(
-	__LSE_PREAMBLE
-	"	neg	%[i], %[i]\n"
-	"	stadd	%[i], %[v]"
-	: [i] "+&r" (i), [v] "+Q" (v->counter)
-	: "r" (v));
-}
-
-#define ATOMIC64_OP_SUB_RETURN(name, mb, cl...)				\
-static inline long __lse_atomic64_sub_return##name(s64 i, atomic64_t *v)	\
-{									\
-	unsigned long tmp;						\
-									\
-	asm volatile(							\
-	__LSE_PREAMBLE							\
-	"	neg	%[i], %[i]\n"					\
-	"	ldadd" #mb "	%[i], %x[tmp], %[v]\n"			\
-	"	add	%[i], %[i], %x[tmp]"				\
-	: [i] "+&r" (i), [v] "+Q" (v->counter), [tmp] "=&r" (tmp)	\
-	: "r" (v)							\
-	: cl);								\
-									\
-	return i;							\
-}
-
-ATOMIC64_OP_SUB_RETURN(_relaxed,   )
-ATOMIC64_OP_SUB_RETURN(_acquire,  a, "memory")
-ATOMIC64_OP_SUB_RETURN(_release,  l, "memory")
-ATOMIC64_OP_SUB_RETURN(        , al, "memory")
-
-#undef ATOMIC64_OP_SUB_RETURN
-
-#define ATOMIC64_FETCH_OP_SUB(name, mb, cl...)				\
-static inline long __lse_atomic64_fetch_sub##name(s64 i, atomic64_t *v)	\
-{									\
-	asm volatile(							\
-	__LSE_PREAMBLE							\
-	"	neg	%[i], %[i]\n"					\
-	"	ldadd" #mb "	%[i], %[i], %[v]"			\
-	: [i] "+&r" (i), [v] "+Q" (v->counter)				\
-	: "r" (v)							\
-	: cl);								\
-									\
-	return i;							\
-}
-
-ATOMIC64_FETCH_OP_SUB(_relaxed,   )
-ATOMIC64_FETCH_OP_SUB(_acquire,  a, "memory")
-ATOMIC64_FETCH_OP_SUB(_release,  l, "memory")
-ATOMIC64_FETCH_OP_SUB(        , al, "memory")
-
-#undef ATOMIC64_FETCH_OP_SUB
-
 static inline s64 __lse_atomic64_dec_if_positive(atomic64_t *v)
 {
 	unsigned long tmp;
diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h
index 1c5a00598458..62217be36217 100644
--- a/arch/arm64/include/asm/barrier.h
+++ b/arch/arm64/include/asm/barrier.h
@@ -26,6 +26,14 @@
 #define __tsb_csync()	asm volatile("hint #18" : : : "memory")
 #define csdb()		asm volatile("hint #20" : : : "memory")
 
+/*
+ * Data Gathering Hint:
+ * This instruction prevents merging memory accesses with Normal-NC or
+ * Device-GRE attributes before the hint instruction with any memory accesses
+ * appearing after the hint instruction.
+ */
+#define dgh()		asm volatile("hint #6" : : : "memory")
+
 #ifdef CONFIG_ARM64_PSEUDO_NMI
 #define pmr_sync()						\
 	do {							\
@@ -46,6 +54,7 @@
 #define dma_rmb()	dmb(oshld)
 #define dma_wmb()	dmb(oshst)
 
+#define io_stop_wc()	dgh()
 
 #define tsb_csync()								\
 	do {									\
diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h
index 0f6d16faa540..a58e366f0b07 100644
--- a/arch/arm64/include/asm/cpu.h
+++ b/arch/arm64/include/asm/cpu.h
@@ -51,6 +51,7 @@ struct cpuinfo_arm64 {
 	u64		reg_id_aa64dfr1;
 	u64		reg_id_aa64isar0;
 	u64		reg_id_aa64isar1;
+	u64		reg_id_aa64isar2;
 	u64		reg_id_aa64mmfr0;
 	u64		reg_id_aa64mmfr1;
 	u64		reg_id_aa64mmfr2;
diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
index dbb4b30a5648..cb24385e3632 100644
--- a/arch/arm64/include/asm/fpsimd.h
+++ b/arch/arm64/include/asm/fpsimd.h
@@ -51,8 +51,8 @@ extern void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *state,
 extern void fpsimd_flush_task_state(struct task_struct *target);
 extern void fpsimd_save_and_flush_cpu_state(void);
 
-/* Maximum VL that SVE VL-agnostic software can transparently support */
-#define SVE_VL_ARCH_MAX 0x100
+/* Maximum VL that SVE/SME VL-agnostic software can transparently support */
+#define VL_ARCH_MAX 0x100
 
 /* Offset of FFR in the SVE register dump */
 static inline size_t sve_ffr_offset(int vl)
@@ -122,7 +122,7 @@ extern void fpsimd_sync_to_sve(struct task_struct *task);
 extern void sve_sync_to_fpsimd(struct task_struct *task);
 extern void sve_sync_from_fpsimd_zeropad(struct task_struct *task);
 
-extern int sve_set_vector_length(struct task_struct *task,
+extern int vec_set_vector_length(struct task_struct *task, enum vec_type type,
 				 unsigned long vl, unsigned long flags);
 
 extern int sve_set_current_vl(unsigned long arg);
diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h
index b100e0055eab..f68fbb207473 100644
--- a/arch/arm64/include/asm/hwcap.h
+++ b/arch/arm64/include/asm/hwcap.h
@@ -106,6 +106,8 @@
 #define KERNEL_HWCAP_BTI		__khwcap2_feature(BTI)
 #define KERNEL_HWCAP_MTE		__khwcap2_feature(MTE)
 #define KERNEL_HWCAP_ECV		__khwcap2_feature(ECV)
+#define KERNEL_HWCAP_AFP		__khwcap2_feature(AFP)
+#define KERNEL_HWCAP_RPRES		__khwcap2_feature(RPRES)
 
 /*
  * This yields a mask that user programs can use to figure out what
diff --git a/arch/arm64/include/asm/linkage.h b/arch/arm64/include/asm/linkage.h
index 9906541a6861..b77e9b3f5371 100644
--- a/arch/arm64/include/asm/linkage.h
+++ b/arch/arm64/include/asm/linkage.h
@@ -1,48 +1,43 @@
 #ifndef __ASM_LINKAGE_H
 #define __ASM_LINKAGE_H
 
+#ifdef __ASSEMBLY__
+#include <asm/assembler.h>
+#endif
+
 #define __ALIGN		.align 2
 #define __ALIGN_STR	".align 2"
 
-#if defined(CONFIG_ARM64_BTI_KERNEL) && defined(__aarch64__)
-
-/*
- * Since current versions of gas reject the BTI instruction unless we
- * set the architecture version to v8.5 we use the hint instruction
- * instead.
- */
-#define BTI_C hint 34 ;
-
 /*
- * When using in-kernel BTI we need to ensure that PCS-conformant assembly
- * functions have suitable annotations.  Override SYM_FUNC_START to insert
- * a BTI landing pad at the start of everything.
+ * When using in-kernel BTI we need to ensure that PCS-conformant
+ * assembly functions have suitable annotations.  Override
+ * SYM_FUNC_START to insert a BTI landing pad at the start of
+ * everything, the override is done unconditionally so we're more
+ * likely to notice any drift from the overridden definitions.
  */
 #define SYM_FUNC_START(name)				\
 	SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN)	\
-	BTI_C
+	bti c ;
 
 #define SYM_FUNC_START_NOALIGN(name)			\
 	SYM_START(name, SYM_L_GLOBAL, SYM_A_NONE)	\
-	BTI_C
+	bti c ;
 
 #define SYM_FUNC_START_LOCAL(name)			\
 	SYM_START(name, SYM_L_LOCAL, SYM_A_ALIGN)	\
-	BTI_C
+	bti c ;
 
 #define SYM_FUNC_START_LOCAL_NOALIGN(name)		\
 	SYM_START(name, SYM_L_LOCAL, SYM_A_NONE)	\
-	BTI_C
+	bti c ;
 
 #define SYM_FUNC_START_WEAK(name)			\
 	SYM_START(name, SYM_L_WEAK, SYM_A_ALIGN)	\
-	BTI_C
+	bti c ;
 
 #define SYM_FUNC_START_WEAK_NOALIGN(name)		\
 	SYM_START(name, SYM_L_WEAK, SYM_A_NONE)		\
-	BTI_C
-
-#endif
+	bti c ;
 
 /*
  * Annotate a function as position independent, i.e., safe to be called before
diff --git a/arch/arm64/include/asm/mte-kasan.h b/arch/arm64/include/asm/mte-kasan.h
index 478b9bcf69ad..e4704a403237 100644
--- a/arch/arm64/include/asm/mte-kasan.h
+++ b/arch/arm64/include/asm/mte-kasan.h
@@ -84,10 +84,12 @@ static inline void __dc_gzva(u64 p)
 static inline void mte_set_mem_tag_range(void *addr, size_t size, u8 tag,
 					 bool init)
 {
-	u64 curr, mask, dczid_bs, end1, end2, end3;
+	u64 curr, mask, dczid, dczid_bs, dczid_dzp, end1, end2, end3;
 
 	/* Read DC G(Z)VA block size from the system register. */
-	dczid_bs = 4ul << (read_cpuid(DCZID_EL0) & 0xf);
+	dczid = read_cpuid(DCZID_EL0);
+	dczid_bs = 4ul << (dczid & 0xf);
+	dczid_dzp = (dczid >> 4) & 1;
 
 	curr = (u64)__tag_set(addr, tag);
 	mask = dczid_bs - 1;
@@ -106,7 +108,7 @@ static inline void mte_set_mem_tag_range(void *addr, size_t size, u8 tag,
 	 */
 #define SET_MEMTAG_RANGE(stg_post, dc_gva)		\
 	do {						\
-		if (size >= 2 * dczid_bs) {		\
+		if (!dczid_dzp && size >= 2 * dczid_bs) {\
 			do {				\
 				curr = stg_post(curr);	\
 			} while (curr < end1);		\
diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h
index 6564a01cc085..e77cdef9ca29 100644
--- a/arch/arm64/include/asm/stacktrace.h
+++ b/arch/arm64/include/asm/stacktrace.h
@@ -47,6 +47,10 @@ struct stack_info {
  * @prev_type:   The type of stack this frame record was on, or a synthetic
  *               value of STACK_TYPE_UNKNOWN. This is used to detect a
  *               transition from one stack to another.
+ *
+ * @kr_cur:      When KRETPROBES is selected, holds the kretprobe instance
+ *               associated with the most recently encountered replacement lr
+ *               value.
  */
 struct stackframe {
 	unsigned long fp;
@@ -59,9 +63,6 @@ struct stackframe {
 #endif
 };
 
-extern int unwind_frame(struct task_struct *tsk, struct stackframe *frame);
-extern void walk_stackframe(struct task_struct *tsk, struct stackframe *frame,
-			    bool (*fn)(void *, unsigned long), void *data);
 extern void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk,
 			   const char *loglvl);
 
@@ -146,7 +147,4 @@ static inline bool on_accessible_stack(const struct task_struct *tsk,
 	return false;
 }
 
-void start_backtrace(struct stackframe *frame, unsigned long fp,
-		     unsigned long pc);
-
 #endif	/* __ASM_STACKTRACE_H */
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 16b3f1a1d468..4704f5893458 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -182,6 +182,7 @@
 
 #define SYS_ID_AA64ISAR0_EL1		sys_reg(3, 0, 0, 6, 0)
 #define SYS_ID_AA64ISAR1_EL1		sys_reg(3, 0, 0, 6, 1)
+#define SYS_ID_AA64ISAR2_EL1		sys_reg(3, 0, 0, 6, 2)
 
 #define SYS_ID_AA64MMFR0_EL1		sys_reg(3, 0, 0, 7, 0)
 #define SYS_ID_AA64MMFR1_EL1		sys_reg(3, 0, 0, 7, 1)
@@ -771,6 +772,20 @@
 #define ID_AA64ISAR1_GPI_NI			0x0
 #define ID_AA64ISAR1_GPI_IMP_DEF		0x1
 
+/* id_aa64isar2 */
+#define ID_AA64ISAR2_RPRES_SHIFT	4
+#define ID_AA64ISAR2_WFXT_SHIFT		0
+
+#define ID_AA64ISAR2_RPRES_8BIT		0x0
+#define ID_AA64ISAR2_RPRES_12BIT	0x1
+/*
+ * Value 0x1 has been removed from the architecture, and is
+ * reserved, but has not yet been removed from the ARM ARM
+ * as of ARM DDI 0487G.b.
+ */
+#define ID_AA64ISAR2_WFXT_NI		0x0
+#define ID_AA64ISAR2_WFXT_SUPPORTED	0x2
+
 /* id_aa64pfr0 */
 #define ID_AA64PFR0_CSV3_SHIFT		60
 #define ID_AA64PFR0_CSV2_SHIFT		56
@@ -889,6 +904,7 @@
 #endif
 
 /* id_aa64mmfr1 */
+#define ID_AA64MMFR1_AFP_SHIFT		44
 #define ID_AA64MMFR1_ETS_SHIFT		36
 #define ID_AA64MMFR1_TWED_SHIFT		32
 #define ID_AA64MMFR1_XNX_SHIFT		28
diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h
index 7b23b16f21ce..f03731847d9d 100644
--- a/arch/arm64/include/uapi/asm/hwcap.h
+++ b/arch/arm64/include/uapi/asm/hwcap.h
@@ -76,5 +76,7 @@
 #define HWCAP2_BTI		(1 << 17)
 #define HWCAP2_MTE		(1 << 18)
 #define HWCAP2_ECV		(1 << 19)
+#define HWCAP2_AFP		(1 << 20)
+#define HWCAP2_RPRES		(1 << 21)
 
 #endif /* _UAPI__ASM_HWCAP_H */
diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c
index f3851724fe35..e4dea8db6924 100644
--- a/arch/arm64/kernel/acpi.c
+++ b/arch/arm64/kernel/acpi.c
@@ -22,6 +22,7 @@
 #include <linux/irq_work.h>
 #include <linux/memblock.h>
 #include <linux/of_fdt.h>
+#include <linux/libfdt.h>
 #include <linux/smp.h>
 #include <linux/serial_core.h>
 #include <linux/pgtable.h>
@@ -62,29 +63,22 @@ static int __init parse_acpi(char *arg)
 }
 early_param("acpi", parse_acpi);
 
-static int __init dt_scan_depth1_nodes(unsigned long node,
-				       const char *uname, int depth,
-				       void *data)
+static bool __init dt_is_stub(void)
 {
-	/*
-	 * Ignore anything not directly under the root node; we'll
-	 * catch its parent instead.
-	 */
-	if (depth != 1)
-		return 0;
+	int node;
 
-	if (strcmp(uname, "chosen") == 0)
-		return 0;
+	fdt_for_each_subnode(node, initial_boot_params, 0) {
+		const char *name = fdt_get_name(initial_boot_params, node, NULL);
+		if (strcmp(name, "chosen") == 0)
+			continue;
+		if (strcmp(name, "hypervisor") == 0 &&
+		    of_flat_dt_is_compatible(node, "xen,xen"))
+			continue;
 
-	if (strcmp(uname, "hypervisor") == 0 &&
-	    of_flat_dt_is_compatible(node, "xen,xen"))
-		return 0;
+		return false;
+	}
 
-	/*
-	 * This node at depth 1 is neither a chosen node nor a xen node,
-	 * which we do not expect.
-	 */
-	return 1;
+	return true;
 }
 
 /*
@@ -205,8 +199,7 @@ void __init acpi_boot_table_init(void)
 	 *   and ACPI has not been [force] enabled (acpi=on|force)
 	 */
 	if (param_acpi_off ||
-	    (!param_acpi_on && !param_acpi_force &&
-	     of_scan_flat_dt(dt_scan_depth1_nodes, NULL)))
+	    (!param_acpi_on && !param_acpi_force && !dt_is_stub()))
 		goto done;
 
 	/*
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 6f3e677d88f1..a46ab3b1c4d5 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -225,6 +225,11 @@ static const struct arm64_ftr_bits ftr_id_aa64isar1[] = {
 	ARM64_FTR_END,
 };
 
+static const struct arm64_ftr_bits ftr_id_aa64isar2[] = {
+	ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_RPRES_SHIFT, 4, 0),
+	ARM64_FTR_END,
+};
+
 static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = {
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_CSV3_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_CSV2_SHIFT, 4, 0),
@@ -325,6 +330,7 @@ static const struct arm64_ftr_bits ftr_id_aa64mmfr0[] = {
 };
 
 static const struct arm64_ftr_bits ftr_id_aa64mmfr1[] = {
+	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_AFP_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_ETS_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_TWED_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_XNX_SHIFT, 4, 0),
@@ -637,6 +643,7 @@ static const struct __ftr_reg_entry {
 	ARM64_FTR_REG(SYS_ID_AA64ISAR0_EL1, ftr_id_aa64isar0),
 	ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64ISAR1_EL1, ftr_id_aa64isar1,
 			       &id_aa64isar1_override),
+	ARM64_FTR_REG(SYS_ID_AA64ISAR2_EL1, ftr_id_aa64isar2),
 
 	/* Op1 = 0, CRn = 0, CRm = 7 */
 	ARM64_FTR_REG(SYS_ID_AA64MMFR0_EL1, ftr_id_aa64mmfr0),
@@ -933,6 +940,7 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info)
 	init_cpu_ftr_reg(SYS_ID_AA64DFR1_EL1, info->reg_id_aa64dfr1);
 	init_cpu_ftr_reg(SYS_ID_AA64ISAR0_EL1, info->reg_id_aa64isar0);
 	init_cpu_ftr_reg(SYS_ID_AA64ISAR1_EL1, info->reg_id_aa64isar1);
+	init_cpu_ftr_reg(SYS_ID_AA64ISAR2_EL1, info->reg_id_aa64isar2);
 	init_cpu_ftr_reg(SYS_ID_AA64MMFR0_EL1, info->reg_id_aa64mmfr0);
 	init_cpu_ftr_reg(SYS_ID_AA64MMFR1_EL1, info->reg_id_aa64mmfr1);
 	init_cpu_ftr_reg(SYS_ID_AA64MMFR2_EL1, info->reg_id_aa64mmfr2);
@@ -1151,6 +1159,8 @@ void update_cpu_features(int cpu,
 				      info->reg_id_aa64isar0, boot->reg_id_aa64isar0);
 	taint |= check_update_ftr_reg(SYS_ID_AA64ISAR1_EL1, cpu,
 				      info->reg_id_aa64isar1, boot->reg_id_aa64isar1);
+	taint |= check_update_ftr_reg(SYS_ID_AA64ISAR2_EL1, cpu,
+				      info->reg_id_aa64isar2, boot->reg_id_aa64isar2);
 
 	/*
 	 * Differing PARange support is fine as long as all peripherals and
@@ -1272,6 +1282,7 @@ u64 __read_sysreg_by_encoding(u32 sys_id)
 	read_sysreg_case(SYS_ID_AA64MMFR2_EL1);
 	read_sysreg_case(SYS_ID_AA64ISAR0_EL1);
 	read_sysreg_case(SYS_ID_AA64ISAR1_EL1);
+	read_sysreg_case(SYS_ID_AA64ISAR2_EL1);
 
 	read_sysreg_case(SYS_CNTFRQ_EL0);
 	read_sysreg_case(SYS_CTR_EL0);
@@ -2476,6 +2487,8 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
 	HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_MTE_SHIFT, FTR_UNSIGNED, ID_AA64PFR1_MTE, CAP_HWCAP, KERNEL_HWCAP_MTE),
 #endif /* CONFIG_ARM64_MTE */
 	HWCAP_CAP(SYS_ID_AA64MMFR0_EL1, ID_AA64MMFR0_ECV_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ECV),
+	HWCAP_CAP(SYS_ID_AA64MMFR1_EL1, ID_AA64MMFR1_AFP_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_AFP),
+	HWCAP_CAP(SYS_ID_AA64ISAR2_EL1, ID_AA64ISAR2_RPRES_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_RPRES),
 	{},
 };
 
diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c
index 6e27b759056a..591c18a889a5 100644
--- a/arch/arm64/kernel/cpuinfo.c
+++ b/arch/arm64/kernel/cpuinfo.c
@@ -95,6 +95,8 @@ static const char *const hwcap_str[] = {
 	[KERNEL_HWCAP_BTI]		= "bti",
 	[KERNEL_HWCAP_MTE]		= "mte",
 	[KERNEL_HWCAP_ECV]		= "ecv",
+	[KERNEL_HWCAP_AFP]		= "afp",
+	[KERNEL_HWCAP_RPRES]		= "rpres",
 };
 
 #ifdef CONFIG_COMPAT
@@ -391,6 +393,7 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info)
 	info->reg_id_aa64dfr1 = read_cpuid(ID_AA64DFR1_EL1);
 	info->reg_id_aa64isar0 = read_cpuid(ID_AA64ISAR0_EL1);
 	info->reg_id_aa64isar1 = read_cpuid(ID_AA64ISAR1_EL1);
+	info->reg_id_aa64isar2 = read_cpuid(ID_AA64ISAR2_EL1);
 	info->reg_id_aa64mmfr0 = read_cpuid(ID_AA64MMFR0_EL1);
 	info->reg_id_aa64mmfr1 = read_cpuid(ID_AA64MMFR1_EL1);
 	info->reg_id_aa64mmfr2 = read_cpuid(ID_AA64MMFR2_EL1);
diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S
index 8cf970d219f5..e535480a4069 100644
--- a/arch/arm64/kernel/entry-ftrace.S
+++ b/arch/arm64/kernel/entry-ftrace.S
@@ -77,17 +77,13 @@
 	.endm
 
 SYM_CODE_START(ftrace_regs_caller)
-#ifdef BTI_C
-	BTI_C
-#endif
+	bti	c
 	ftrace_regs_entry	1
 	b	ftrace_common
 SYM_CODE_END(ftrace_regs_caller)
 
 SYM_CODE_START(ftrace_caller)
-#ifdef BTI_C
-	BTI_C
-#endif
+	bti	c
 	ftrace_regs_entry	0
 	b	ftrace_common
 SYM_CODE_END(ftrace_caller)
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 2f69ae43941d..772ec2ecf488 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -966,8 +966,10 @@ SYM_CODE_START(__sdei_asm_handler)
 	mov	sp, x1
 
 	mov	x1, x0			// address to complete_and_resume
-	/* x0 = (x0 <= 1) ? EVENT_COMPLETE:EVENT_COMPLETE_AND_RESUME */
-	cmp	x0, #1
+	/* x0 = (x0 <= SDEI_EV_FAILED) ?
+	 * EVENT_COMPLETE:EVENT_COMPLETE_AND_RESUME
+	 */
+	cmp	x0, #SDEI_EV_FAILED
 	mov_q	x2, SDEI_1_0_FN_SDEI_EVENT_COMPLETE
 	mov_q	x3, SDEI_1_0_FN_SDEI_EVENT_COMPLETE_AND_RESUME
 	csel	x0, x2, x3, ls
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index fa244c426f61..f2307d6631eb 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -15,6 +15,7 @@
 #include <linux/compiler.h>
 #include <linux/cpu.h>
 #include <linux/cpu_pm.h>
+#include <linux/ctype.h>
 #include <linux/kernel.h>
 #include <linux/linkage.h>
 #include <linux/irqflags.h>
@@ -406,12 +407,13 @@ static unsigned int find_supported_vector_length(enum vec_type type,
 
 #if defined(CONFIG_ARM64_SVE) && defined(CONFIG_SYSCTL)
 
-static int sve_proc_do_default_vl(struct ctl_table *table, int write,
+static int vec_proc_do_default_vl(struct ctl_table *table, int write,
 				  void *buffer, size_t *lenp, loff_t *ppos)
 {
-	struct vl_info *info = &vl_info[ARM64_VEC_SVE];
+	struct vl_info *info = table->extra1;
+	enum vec_type type = info->type;
 	int ret;
-	int vl = get_sve_default_vl();
+	int vl = get_default_vl(type);
 	struct ctl_table tmp_table = {
 		.data = &vl,
 		.maxlen = sizeof(vl),
@@ -428,7 +430,7 @@ static int sve_proc_do_default_vl(struct ctl_table *table, int write,
 	if (!sve_vl_valid(vl))
 		return -EINVAL;
 
-	set_sve_default_vl(find_supported_vector_length(ARM64_VEC_SVE, vl));
+	set_default_vl(type, find_supported_vector_length(type, vl));
 	return 0;
 }
 
@@ -436,7 +438,8 @@ static struct ctl_table sve_default_vl_table[] = {
 	{
 		.procname	= "sve_default_vector_length",
 		.mode		= 0644,
-		.proc_handler	= sve_proc_do_default_vl,
+		.proc_handler	= vec_proc_do_default_vl,
+		.extra1		= &vl_info[ARM64_VEC_SVE],
 	},
 	{ }
 };
@@ -629,7 +632,7 @@ void sve_sync_from_fpsimd_zeropad(struct task_struct *task)
 	__fpsimd_to_sve(sst, fst, vq);
 }
 
-int sve_set_vector_length(struct task_struct *task,
+int vec_set_vector_length(struct task_struct *task, enum vec_type type,
 			  unsigned long vl, unsigned long flags)
 {
 	if (flags & ~(unsigned long)(PR_SVE_VL_INHERIT |
@@ -640,33 +643,35 @@ int sve_set_vector_length(struct task_struct *task,
 		return -EINVAL;
 
 	/*
-	 * Clamp to the maximum vector length that VL-agnostic SVE code can
-	 * work with.  A flag may be assigned in the future to allow setting
-	 * of larger vector lengths without confusing older software.
+	 * Clamp to the maximum vector length that VL-agnostic code
+	 * can work with.  A flag may be assigned in the future to
+	 * allow setting of larger vector lengths without confusing
+	 * older software.
 	 */
-	if (vl > SVE_VL_ARCH_MAX)
-		vl = SVE_VL_ARCH_MAX;
+	if (vl > VL_ARCH_MAX)
+		vl = VL_ARCH_MAX;
 
-	vl = find_supported_vector_length(ARM64_VEC_SVE, vl);
+	vl = find_supported_vector_length(type, vl);
 
 	if (flags & (PR_SVE_VL_INHERIT |
 		     PR_SVE_SET_VL_ONEXEC))
-		task_set_sve_vl_onexec(task, vl);
+		task_set_vl_onexec(task, type, vl);
 	else
 		/* Reset VL to system default on next exec: */
-		task_set_sve_vl_onexec(task, 0);
+		task_set_vl_onexec(task, type, 0);
 
 	/* Only actually set the VL if not deferred: */
 	if (flags & PR_SVE_SET_VL_ONEXEC)
 		goto out;
 
-	if (vl == task_get_sve_vl(task))
+	if (vl == task_get_vl(task, type))
 		goto out;
 
 	/*
 	 * To ensure the FPSIMD bits of the SVE vector registers are preserved,
 	 * write any live register state back to task_struct, and convert to a
-	 * non-SVE thread.
+	 * regular FPSIMD thread.  Since the vector length can only be changed
+	 * with a syscall we can't be in streaming mode while reconfiguring.
 	 */
 	if (task == current) {
 		get_cpu_fpsimd_context();
@@ -687,10 +692,10 @@ int sve_set_vector_length(struct task_struct *task,
 	 */
 	sve_free(task);
 
-	task_set_sve_vl(task, vl);
+	task_set_vl(task, type, vl);
 
 out:
-	update_tsk_thread_flag(task, TIF_SVE_VL_INHERIT,
+	update_tsk_thread_flag(task, vec_vl_inherit_flag(type),
 			       flags & PR_SVE_VL_INHERIT);
 
 	return 0;
@@ -698,20 +703,21 @@ out:
 
 /*
  * Encode the current vector length and flags for return.
- * This is only required for prctl(): ptrace has separate fields
+ * This is only required for prctl(): ptrace has separate fields.
+ * SVE and SME use the same bits for _ONEXEC and _INHERIT.
  *
- * flags are as for sve_set_vector_length().
+ * flags are as for vec_set_vector_length().
  */
-static int sve_prctl_status(unsigned long flags)
+static int vec_prctl_status(enum vec_type type, unsigned long flags)
 {
 	int ret;
 
 	if (flags & PR_SVE_SET_VL_ONEXEC)
-		ret = task_get_sve_vl_onexec(current);
+		ret = task_get_vl_onexec(current, type);
 	else
-		ret = task_get_sve_vl(current);
+		ret = task_get_vl(current, type);
 
-	if (test_thread_flag(TIF_SVE_VL_INHERIT))
+	if (test_thread_flag(vec_vl_inherit_flag(type)))
 		ret |= PR_SVE_VL_INHERIT;
 
 	return ret;
@@ -729,11 +735,11 @@ int sve_set_current_vl(unsigned long arg)
 	if (!system_supports_sve() || is_compat_task())
 		return -EINVAL;
 
-	ret = sve_set_vector_length(current, vl, flags);
+	ret = vec_set_vector_length(current, ARM64_VEC_SVE, vl, flags);
 	if (ret)
 		return ret;
 
-	return sve_prctl_status(flags);
+	return vec_prctl_status(ARM64_VEC_SVE, flags);
 }
 
 /* PR_SVE_GET_VL */
@@ -742,7 +748,7 @@ int sve_get_current_vl(void)
 	if (!system_supports_sve() || is_compat_task())
 		return -EINVAL;
 
-	return sve_prctl_status(0);
+	return vec_prctl_status(ARM64_VEC_SVE, 0);
 }
 
 static void vec_probe_vqs(struct vl_info *info,
@@ -1107,7 +1113,7 @@ static void fpsimd_flush_thread_vl(enum vec_type type)
 		vl = get_default_vl(type);
 
 	if (WARN_ON(!sve_vl_valid(vl)))
-		vl = SVE_VL_MIN;
+		vl = vl_info[type].min_vl;
 
 	supported_vl = find_supported_vector_length(type, vl);
 	if (WARN_ON(supported_vl != vl))
@@ -1213,7 +1219,8 @@ void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *st, void *sve_state,
 /*
  * Load the userland FPSIMD state of 'current' from memory, but only if the
  * FPSIMD state already held in the registers is /not/ the most recent FPSIMD
- * state of 'current'
+ * state of 'current'.  This is called when we are preparing to return to
+ * userspace to ensure that userspace sees a good register state.
  */
 void fpsimd_restore_current_state(void)
 {
@@ -1244,7 +1251,9 @@ void fpsimd_restore_current_state(void)
 /*
  * Load an updated userland FPSIMD state for 'current' from memory and set the
  * flag that indicates that the FPSIMD register contents are the most recent
- * FPSIMD state of 'current'
+ * FPSIMD state of 'current'. This is used by the signal code to restore the
+ * register state when returning from a signal handler in FPSIMD only cases,
+ * any SVE context will be discarded.
  */
 void fpsimd_update_current_state(struct user_fpsimd_state const *state)
 {
diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c
index 2758f75d6809..6328308be272 100644
--- a/arch/arm64/kernel/hibernate.c
+++ b/arch/arm64/kernel/hibernate.c
@@ -7,10 +7,6 @@
  * Ubuntu project, hibernation support for mach-dove
  * Copyright (C) 2010 Nokia Corporation (Hiroshi Doyu)
  * Copyright (C) 2010 Texas Instruments, Inc. (Teerth Reddy et al.)
- *  https://lkml.org/lkml/2010/6/18/4
- *  https://lists.linux-foundation.org/pipermail/linux-pm/2010-June/027422.html
- *  https://patchwork.kernel.org/patch/96442/
- *
  * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
  */
 #define pr_fmt(x) "hibernate: " x
diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c
index 6fb31c117ebe..e16b248699d5 100644
--- a/arch/arm64/kernel/machine_kexec.c
+++ b/arch/arm64/kernel/machine_kexec.c
@@ -104,13 +104,15 @@ static void *kexec_page_alloc(void *arg)
 {
 	struct kimage *kimage = (struct kimage *)arg;
 	struct page *page = kimage_alloc_control_pages(kimage, 0);
+	void *vaddr = NULL;
 
 	if (!page)
 		return NULL;
 
-	memset(page_address(page), 0, PAGE_SIZE);
+	vaddr = page_address(page);
+	memset(vaddr, 0, PAGE_SIZE);
 
-	return page_address(page);
+	return vaddr;
 }
 
 int machine_kexec_post_load(struct kimage *kimage)
diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c
index 4a72c2727309..e9b7d99f4e3a 100644
--- a/arch/arm64/kernel/perf_callchain.c
+++ b/arch/arm64/kernel/perf_callchain.c
@@ -5,10 +5,10 @@
  * Copyright (C) 2015 ARM Limited
  */
 #include <linux/perf_event.h>
+#include <linux/stacktrace.h>
 #include <linux/uaccess.h>
 
 #include <asm/pointer_auth.h>
-#include <asm/stacktrace.h>
 
 struct frame_tail {
 	struct frame_tail	__user *fp;
@@ -132,30 +132,21 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
 	}
 }
 
-/*
- * Gets called by walk_stackframe() for every stackframe. This will be called
- * whist unwinding the stackframe and is like a subroutine return so we use
- * the PC.
- */
 static bool callchain_trace(void *data, unsigned long pc)
 {
 	struct perf_callchain_entry_ctx *entry = data;
-	perf_callchain_store(entry, pc);
-	return true;
+	return perf_callchain_store(entry, pc) == 0;
 }
 
 void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
 			   struct pt_regs *regs)
 {
-	struct stackframe frame;
-
 	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
 		/* We don't support guest os callchain now */
 		return;
 	}
 
-	start_backtrace(&frame, regs->regs[29], regs->pc);
-	walk_stackframe(current, &frame, callchain_trace, entry);
+	arch_stack_walk(callchain_trace, entry, current, regs);
 }
 
 unsigned long perf_instruction_pointer(struct pt_regs *regs)
diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
index b4044469527e..cab678ed6618 100644
--- a/arch/arm64/kernel/perf_event.c
+++ b/arch/arm64/kernel/perf_event.c
@@ -285,15 +285,24 @@ static const struct attribute_group armv8_pmuv3_events_attr_group = {
 
 PMU_FORMAT_ATTR(event, "config:0-15");
 PMU_FORMAT_ATTR(long, "config1:0");
+PMU_FORMAT_ATTR(rdpmc, "config1:1");
+
+static int sysctl_perf_user_access __read_mostly;
 
 static inline bool armv8pmu_event_is_64bit(struct perf_event *event)
 {
 	return event->attr.config1 & 0x1;
 }
 
+static inline bool armv8pmu_event_want_user_access(struct perf_event *event)
+{
+	return event->attr.config1 & 0x2;
+}
+
 static struct attribute *armv8_pmuv3_format_attrs[] = {
 	&format_attr_event.attr,
 	&format_attr_long.attr,
+	&format_attr_rdpmc.attr,
 	NULL,
 };
 
@@ -362,7 +371,7 @@ static const struct attribute_group armv8_pmuv3_caps_attr_group = {
  */
 #define	ARMV8_IDX_CYCLE_COUNTER	0
 #define	ARMV8_IDX_COUNTER0	1
-
+#define	ARMV8_IDX_CYCLE_COUNTER_USER	32
 
 /*
  * We unconditionally enable ARMv8.5-PMU long event counter support
@@ -374,18 +383,22 @@ static bool armv8pmu_has_long_event(struct arm_pmu *cpu_pmu)
 	return (cpu_pmu->pmuver >= ID_AA64DFR0_PMUVER_8_5);
 }
 
+static inline bool armv8pmu_event_has_user_read(struct perf_event *event)
+{
+	return event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT;
+}
+
 /*
  * We must chain two programmable counters for 64 bit events,
  * except when we have allocated the 64bit cycle counter (for CPU
- * cycles event). This must be called only when the event has
- * a counter allocated.
+ * cycles event) or when user space counter access is enabled.
  */
 static inline bool armv8pmu_event_is_chained(struct perf_event *event)
 {
 	int idx = event->hw.idx;
 	struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
 
-	return !WARN_ON(idx < 0) &&
+	return !armv8pmu_event_has_user_read(event) &&
 	       armv8pmu_event_is_64bit(event) &&
 	       !armv8pmu_has_long_event(cpu_pmu) &&
 	       (idx != ARMV8_IDX_CYCLE_COUNTER);
@@ -718,6 +731,28 @@ static inline u32 armv8pmu_getreset_flags(void)
 	return value;
 }
 
+static void armv8pmu_disable_user_access(void)
+{
+	write_sysreg(0, pmuserenr_el0);
+}
+
+static void armv8pmu_enable_user_access(struct arm_pmu *cpu_pmu)
+{
+	int i;
+	struct pmu_hw_events *cpuc = this_cpu_ptr(cpu_pmu->hw_events);
+
+	/* Clear any unused counters to avoid leaking their contents */
+	for_each_clear_bit(i, cpuc->used_mask, cpu_pmu->num_events) {
+		if (i == ARMV8_IDX_CYCLE_COUNTER)
+			write_sysreg(0, pmccntr_el0);
+		else
+			armv8pmu_write_evcntr(i, 0);
+	}
+
+	write_sysreg(0, pmuserenr_el0);
+	write_sysreg(ARMV8_PMU_USERENR_ER | ARMV8_PMU_USERENR_CR, pmuserenr_el0);
+}
+
 static void armv8pmu_enable_event(struct perf_event *event)
 {
 	/*
@@ -761,6 +796,14 @@ static void armv8pmu_disable_event(struct perf_event *event)
 
 static void armv8pmu_start(struct arm_pmu *cpu_pmu)
 {
+	struct perf_event_context *task_ctx =
+		this_cpu_ptr(cpu_pmu->pmu.pmu_cpu_context)->task_ctx;
+
+	if (sysctl_perf_user_access && task_ctx && task_ctx->nr_user)
+		armv8pmu_enable_user_access(cpu_pmu);
+	else
+		armv8pmu_disable_user_access();
+
 	/* Enable all counters */
 	armv8pmu_pmcr_write(armv8pmu_pmcr_read() | ARMV8_PMU_PMCR_E);
 }
@@ -878,13 +921,16 @@ static int armv8pmu_get_event_idx(struct pmu_hw_events *cpuc,
 	if (evtype == ARMV8_PMUV3_PERFCTR_CPU_CYCLES) {
 		if (!test_and_set_bit(ARMV8_IDX_CYCLE_COUNTER, cpuc->used_mask))
 			return ARMV8_IDX_CYCLE_COUNTER;
+		else if (armv8pmu_event_is_64bit(event) &&
+			   armv8pmu_event_want_user_access(event) &&
+			   !armv8pmu_has_long_event(cpu_pmu))
+				return -EAGAIN;
 	}
 
 	/*
 	 * Otherwise use events counters
 	 */
-	if (armv8pmu_event_is_64bit(event) &&
-	    !armv8pmu_has_long_event(cpu_pmu))
+	if (armv8pmu_event_is_chained(event))
 		return	armv8pmu_get_chain_idx(cpuc, cpu_pmu);
 	else
 		return armv8pmu_get_single_idx(cpuc, cpu_pmu);
@@ -900,6 +946,22 @@ static void armv8pmu_clear_event_idx(struct pmu_hw_events *cpuc,
 		clear_bit(idx - 1, cpuc->used_mask);
 }
 
+static int armv8pmu_user_event_idx(struct perf_event *event)
+{
+	if (!sysctl_perf_user_access || !armv8pmu_event_has_user_read(event))
+		return 0;
+
+	/*
+	 * We remap the cycle counter index to 32 to
+	 * match the offset applied to the rest of
+	 * the counter indices.
+	 */
+	if (event->hw.idx == ARMV8_IDX_CYCLE_COUNTER)
+		return ARMV8_IDX_CYCLE_COUNTER_USER;
+
+	return event->hw.idx;
+}
+
 /*
  * Add an event filter to a given event.
  */
@@ -996,6 +1058,25 @@ static int __armv8_pmuv3_map_event(struct perf_event *event,
 	if (armv8pmu_event_is_64bit(event))
 		event->hw.flags |= ARMPMU_EVT_64BIT;
 
+	/*
+	 * User events must be allocated into a single counter, and so
+	 * must not be chained.
+	 *
+	 * Most 64-bit events require long counter support, but 64-bit
+	 * CPU_CYCLES events can be placed into the dedicated cycle
+	 * counter when this is free.
+	 */
+	if (armv8pmu_event_want_user_access(event)) {
+		if (!(event->attach_state & PERF_ATTACH_TASK))
+			return -EINVAL;
+		if (armv8pmu_event_is_64bit(event) &&
+		    (hw_event_id != ARMV8_PMUV3_PERFCTR_CPU_CYCLES) &&
+		    !armv8pmu_has_long_event(armpmu))
+			return -EOPNOTSUPP;
+
+		event->hw.flags |= PERF_EVENT_FLAG_USER_READ_CNT;
+	}
+
 	/* Only expose micro/arch events supported by this PMU */
 	if ((hw_event_id > 0) && (hw_event_id < ARMV8_PMUV3_MAX_COMMON_EVENTS)
 	    && test_bit(hw_event_id, armpmu->pmceid_bitmap)) {
@@ -1104,6 +1185,43 @@ static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
 	return probe.present ? 0 : -ENODEV;
 }
 
+static void armv8pmu_disable_user_access_ipi(void *unused)
+{
+	armv8pmu_disable_user_access();
+}
+
+static int armv8pmu_proc_user_access_handler(struct ctl_table *table, int write,
+		void *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (ret || !write || sysctl_perf_user_access)
+		return ret;
+
+	on_each_cpu(armv8pmu_disable_user_access_ipi, NULL, 1);
+	return 0;
+}
+
+static struct ctl_table armv8_pmu_sysctl_table[] = {
+	{
+		.procname       = "perf_user_access",
+		.data		= &sysctl_perf_user_access,
+		.maxlen		= sizeof(unsigned int),
+		.mode           = 0644,
+		.proc_handler	= armv8pmu_proc_user_access_handler,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+	{ }
+};
+
+static void armv8_pmu_register_sysctl_table(void)
+{
+	static u32 tbl_registered = 0;
+
+	if (!cmpxchg_relaxed(&tbl_registered, 0, 1))
+		register_sysctl("kernel", armv8_pmu_sysctl_table);
+}
+
 static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name,
 			  int (*map_event)(struct perf_event *event),
 			  const struct attribute_group *events,
@@ -1127,6 +1245,8 @@ static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name,
 	cpu_pmu->set_event_filter	= armv8pmu_set_event_filter;
 	cpu_pmu->filter_match		= armv8pmu_filter_match;
 
+	cpu_pmu->pmu.event_idx		= armv8pmu_user_event_idx;
+
 	cpu_pmu->name			= name;
 	cpu_pmu->map_event		= map_event;
 	cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = events ?
@@ -1136,6 +1256,7 @@ static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name,
 	cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_CAPS] = caps ?
 			caps : &armv8_pmuv3_caps_attr_group;
 
+	armv8_pmu_register_sysctl_table();
 	return 0;
 }
 
@@ -1145,17 +1266,32 @@ static int armv8_pmu_init_nogroups(struct arm_pmu *cpu_pmu, char *name,
 	return armv8_pmu_init(cpu_pmu, name, map_event, NULL, NULL, NULL);
 }
 
-static int armv8_pmuv3_init(struct arm_pmu *cpu_pmu)
-{
-	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_pmuv3",
-				       armv8_pmuv3_map_event);
+#define PMUV3_INIT_SIMPLE(name)						\
+static int name##_pmu_init(struct arm_pmu *cpu_pmu)			\
+{									\
+	return armv8_pmu_init_nogroups(cpu_pmu, #name, armv8_pmuv3_map_event);\
 }
 
-static int armv8_a34_pmu_init(struct arm_pmu *cpu_pmu)
-{
-	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a34",
-				       armv8_pmuv3_map_event);
-}
+PMUV3_INIT_SIMPLE(armv8_pmuv3)
+
+PMUV3_INIT_SIMPLE(armv8_cortex_a34)
+PMUV3_INIT_SIMPLE(armv8_cortex_a55)
+PMUV3_INIT_SIMPLE(armv8_cortex_a65)
+PMUV3_INIT_SIMPLE(armv8_cortex_a75)
+PMUV3_INIT_SIMPLE(armv8_cortex_a76)
+PMUV3_INIT_SIMPLE(armv8_cortex_a77)
+PMUV3_INIT_SIMPLE(armv8_cortex_a78)
+PMUV3_INIT_SIMPLE(armv9_cortex_a510)
+PMUV3_INIT_SIMPLE(armv9_cortex_a710)
+PMUV3_INIT_SIMPLE(armv8_cortex_x1)
+PMUV3_INIT_SIMPLE(armv9_cortex_x2)
+PMUV3_INIT_SIMPLE(armv8_neoverse_e1)
+PMUV3_INIT_SIMPLE(armv8_neoverse_n1)
+PMUV3_INIT_SIMPLE(armv9_neoverse_n2)
+PMUV3_INIT_SIMPLE(armv8_neoverse_v1)
+
+PMUV3_INIT_SIMPLE(armv8_nvidia_carmel)
+PMUV3_INIT_SIMPLE(armv8_nvidia_denver)
 
 static int armv8_a35_pmu_init(struct arm_pmu *cpu_pmu)
 {
@@ -1169,24 +1305,12 @@ static int armv8_a53_pmu_init(struct arm_pmu *cpu_pmu)
 				       armv8_a53_map_event);
 }
 
-static int armv8_a55_pmu_init(struct arm_pmu *cpu_pmu)
-{
-	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a55",
-				       armv8_pmuv3_map_event);
-}
-
 static int armv8_a57_pmu_init(struct arm_pmu *cpu_pmu)
 {
 	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a57",
 				       armv8_a57_map_event);
 }
 
-static int armv8_a65_pmu_init(struct arm_pmu *cpu_pmu)
-{
-	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a65",
-				       armv8_pmuv3_map_event);
-}
-
 static int armv8_a72_pmu_init(struct arm_pmu *cpu_pmu)
 {
 	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a72",
@@ -1199,42 +1323,6 @@ static int armv8_a73_pmu_init(struct arm_pmu *cpu_pmu)
 				       armv8_a73_map_event);
 }
 
-static int armv8_a75_pmu_init(struct arm_pmu *cpu_pmu)
-{
-	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a75",
-				       armv8_pmuv3_map_event);
-}
-
-static int armv8_a76_pmu_init(struct arm_pmu *cpu_pmu)
-{
-	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a76",
-				       armv8_pmuv3_map_event);
-}
-
-static int armv8_a77_pmu_init(struct arm_pmu *cpu_pmu)
-{
-	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a77",
-				       armv8_pmuv3_map_event);
-}
-
-static int armv8_a78_pmu_init(struct arm_pmu *cpu_pmu)
-{
-	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a78",
-				       armv8_pmuv3_map_event);
-}
-
-static int armv8_e1_pmu_init(struct arm_pmu *cpu_pmu)
-{
-	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_neoverse_e1",
-				       armv8_pmuv3_map_event);
-}
-
-static int armv8_n1_pmu_init(struct arm_pmu *cpu_pmu)
-{
-	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_neoverse_n1",
-				       armv8_pmuv3_map_event);
-}
-
 static int armv8_thunder_pmu_init(struct arm_pmu *cpu_pmu)
 {
 	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cavium_thunder",
@@ -1248,23 +1336,31 @@ static int armv8_vulcan_pmu_init(struct arm_pmu *cpu_pmu)
 }
 
 static const struct of_device_id armv8_pmu_of_device_ids[] = {
-	{.compatible = "arm,armv8-pmuv3",	.data = armv8_pmuv3_init},
-	{.compatible = "arm,cortex-a34-pmu",	.data = armv8_a34_pmu_init},
+	{.compatible = "arm,armv8-pmuv3",	.data = armv8_pmuv3_pmu_init},
+	{.compatible = "arm,cortex-a34-pmu",	.data = armv8_cortex_a34_pmu_init},
 	{.compatible = "arm,cortex-a35-pmu",	.data = armv8_a35_pmu_init},
 	{.compatible = "arm,cortex-a53-pmu",	.data = armv8_a53_pmu_init},
-	{.compatible = "arm,cortex-a55-pmu",	.data = armv8_a55_pmu_init},
+	{.compatible = "arm,cortex-a55-pmu",	.data = armv8_cortex_a55_pmu_init},
 	{.compatible = "arm,cortex-a57-pmu",	.data = armv8_a57_pmu_init},
-	{.compatible = "arm,cortex-a65-pmu",	.data = armv8_a65_pmu_init},
+	{.compatible = "arm,cortex-a65-pmu",	.data = armv8_cortex_a65_pmu_init},
 	{.compatible = "arm,cortex-a72-pmu",	.data = armv8_a72_pmu_init},
 	{.compatible = "arm,cortex-a73-pmu",	.data = armv8_a73_pmu_init},
-	{.compatible = "arm,cortex-a75-pmu",	.data = armv8_a75_pmu_init},
-	{.compatible = "arm,cortex-a76-pmu",	.data = armv8_a76_pmu_init},
-	{.compatible = "arm,cortex-a77-pmu",	.data = armv8_a77_pmu_init},
-	{.compatible = "arm,cortex-a78-pmu",	.data = armv8_a78_pmu_init},
-	{.compatible = "arm,neoverse-e1-pmu",	.data = armv8_e1_pmu_init},
-	{.compatible = "arm,neoverse-n1-pmu",	.data = armv8_n1_pmu_init},
+	{.compatible = "arm,cortex-a75-pmu",	.data = armv8_cortex_a75_pmu_init},
+	{.compatible = "arm,cortex-a76-pmu",	.data = armv8_cortex_a76_pmu_init},
+	{.compatible = "arm,cortex-a77-pmu",	.data = armv8_cortex_a77_pmu_init},
+	{.compatible = "arm,cortex-a78-pmu",	.data = armv8_cortex_a78_pmu_init},
+	{.compatible = "arm,cortex-a510-pmu",	.data = armv9_cortex_a510_pmu_init},
+	{.compatible = "arm,cortex-a710-pmu",	.data = armv9_cortex_a710_pmu_init},
+	{.compatible = "arm,cortex-x1-pmu",	.data = armv8_cortex_x1_pmu_init},
+	{.compatible = "arm,cortex-x2-pmu",	.data = armv9_cortex_x2_pmu_init},
+	{.compatible = "arm,neoverse-e1-pmu",	.data = armv8_neoverse_e1_pmu_init},
+	{.compatible = "arm,neoverse-n1-pmu",	.data = armv8_neoverse_n1_pmu_init},
+	{.compatible = "arm,neoverse-n2-pmu",	.data = armv9_neoverse_n2_pmu_init},
+	{.compatible = "arm,neoverse-v1-pmu",	.data = armv8_neoverse_v1_pmu_init},
 	{.compatible = "cavium,thunder-pmu",	.data = armv8_thunder_pmu_init},
 	{.compatible = "brcm,vulcan-pmu",	.data = armv8_vulcan_pmu_init},
+	{.compatible = "nvidia,carmel-pmu",	.data = armv8_nvidia_carmel_pmu_init},
+	{.compatible = "nvidia,denver-pmu",	.data = armv8_nvidia_denver_pmu_init},
 	{},
 };
 
@@ -1287,7 +1383,7 @@ static int __init armv8_pmu_driver_init(void)
 	if (acpi_disabled)
 		return platform_driver_register(&armv8_pmu_driver);
 	else
-		return arm_pmu_acpi_probe(armv8_pmuv3_init);
+		return arm_pmu_acpi_probe(armv8_pmuv3_pmu_init);
 }
 device_initcall(armv8_pmu_driver_init)
 
@@ -1301,6 +1397,14 @@ void arch_perf_update_userpage(struct perf_event *event,
 	userpg->cap_user_time = 0;
 	userpg->cap_user_time_zero = 0;
 	userpg->cap_user_time_short = 0;
+	userpg->cap_user_rdpmc = armv8pmu_event_has_user_read(event);
+
+	if (userpg->cap_user_rdpmc) {
+		if (event->hw.flags & ARMPMU_EVT_64BIT)
+			userpg->pmc_width = 64;
+		else
+			userpg->pmc_width = 32;
+	}
 
 	do {
 		rd = sched_clock_read_begin(&seq);
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index aacf2f5559a8..5369e649fa79 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -40,6 +40,7 @@
 #include <linux/percpu.h>
 #include <linux/thread_info.h>
 #include <linux/prctl.h>
+#include <linux/stacktrace.h>
 
 #include <asm/alternative.h>
 #include <asm/compat.h>
@@ -439,34 +440,26 @@ static void entry_task_switch(struct task_struct *next)
 
 /*
  * ARM erratum 1418040 handling, affecting the 32bit view of CNTVCT.
- * Assuming the virtual counter is enabled at the beginning of times:
- *
- * - disable access when switching from a 64bit task to a 32bit task
- * - enable access when switching from a 32bit task to a 64bit task
+ * Ensure access is disabled when switching to a 32bit task, ensure
+ * access is enabled when switching to a 64bit task.
  */
-static void erratum_1418040_thread_switch(struct task_struct *prev,
-					  struct task_struct *next)
+static void erratum_1418040_thread_switch(struct task_struct *next)
 {
-	bool prev32, next32;
-	u64 val;
-
-	if (!IS_ENABLED(CONFIG_ARM64_ERRATUM_1418040))
-		return;
-
-	prev32 = is_compat_thread(task_thread_info(prev));
-	next32 = is_compat_thread(task_thread_info(next));
-
-	if (prev32 == next32 || !this_cpu_has_cap(ARM64_WORKAROUND_1418040))
+	if (!IS_ENABLED(CONFIG_ARM64_ERRATUM_1418040) ||
+	    !this_cpu_has_cap(ARM64_WORKAROUND_1418040))
 		return;
 
-	val = read_sysreg(cntkctl_el1);
-
-	if (!next32)
-		val |= ARCH_TIMER_USR_VCT_ACCESS_EN;
+	if (is_compat_thread(task_thread_info(next)))
+		sysreg_clear_set(cntkctl_el1, ARCH_TIMER_USR_VCT_ACCESS_EN, 0);
 	else
-		val &= ~ARCH_TIMER_USR_VCT_ACCESS_EN;
+		sysreg_clear_set(cntkctl_el1, 0, ARCH_TIMER_USR_VCT_ACCESS_EN);
+}
 
-	write_sysreg(val, cntkctl_el1);
+static void erratum_1418040_new_exec(void)
+{
+	preempt_disable();
+	erratum_1418040_thread_switch(current);
+	preempt_enable();
 }
 
 /*
@@ -490,7 +483,8 @@ void update_sctlr_el1(u64 sctlr)
 /*
  * Thread switching.
  */
-__notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev,
+__notrace_funcgraph __sched
+struct task_struct *__switch_to(struct task_struct *prev,
 				struct task_struct *next)
 {
 	struct task_struct *last;
@@ -501,7 +495,7 @@ __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev,
 	contextidr_thread_switch(next);
 	entry_task_switch(next);
 	ssbs_thread_switch(next);
-	erratum_1418040_thread_switch(prev, next);
+	erratum_1418040_thread_switch(next);
 	ptrauth_thread_switch_user(next);
 
 	/*
@@ -528,30 +522,37 @@ __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev,
 	return last;
 }
 
+struct wchan_info {
+	unsigned long	pc;
+	int		count;
+};
+
+static bool get_wchan_cb(void *arg, unsigned long pc)
+{
+	struct wchan_info *wchan_info = arg;
+
+	if (!in_sched_functions(pc)) {
+		wchan_info->pc = pc;
+		return false;
+	}
+	return wchan_info->count++ < 16;
+}
+
 unsigned long __get_wchan(struct task_struct *p)
 {
-	struct stackframe frame;
-	unsigned long stack_page, ret = 0;
-	int count = 0;
+	struct wchan_info wchan_info = {
+		.pc = 0,
+		.count = 0,
+	};
 
-	stack_page = (unsigned long)try_get_task_stack(p);
-	if (!stack_page)
+	if (!try_get_task_stack(p))
 		return 0;
 
-	start_backtrace(&frame, thread_saved_fp(p), thread_saved_pc(p));
-
-	do {
-		if (unwind_frame(p, &frame))
-			goto out;
-		if (!in_sched_functions(frame.pc)) {
-			ret = frame.pc;
-			goto out;
-		}
-	} while (count++ < 16);
+	arch_stack_walk(get_wchan_cb, &wchan_info, p, NULL);
 
-out:
 	put_task_stack(p);
-	return ret;
+
+	return wchan_info.pc;
 }
 
 unsigned long arch_align_stack(unsigned long sp)
@@ -611,6 +612,7 @@ void arch_setup_new_exec(void)
 	current->mm->context.flags = mmflags;
 	ptrauth_thread_init_user();
 	mte_thread_init_user();
+	erratum_1418040_new_exec();
 
 	if (task_spec_ssb_noexec(current)) {
 		arch_prctl_spec_ctrl_set(current, PR_SPEC_STORE_BYPASS,
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index 88a9034fb9b5..716dde289446 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -812,9 +812,9 @@ static int sve_set(struct task_struct *target,
 
 	/*
 	 * Apart from SVE_PT_REGS_MASK, all SVE_PT_* flags are consumed by
-	 * sve_set_vector_length(), which will also validate them for us:
+	 * vec_set_vector_length(), which will also validate them for us:
 	 */
-	ret = sve_set_vector_length(target, header.vl,
+	ret = vec_set_vector_length(target, ARM64_VEC_SVE, header.vl,
 		((unsigned long)header.flags & ~SVE_PT_REGS_MASK) << 16);
 	if (ret)
 		goto out;
diff --git a/arch/arm64/kernel/return_address.c b/arch/arm64/kernel/return_address.c
index a6d18755652f..68330017d04f 100644
--- a/arch/arm64/kernel/return_address.c
+++ b/arch/arm64/kernel/return_address.c
@@ -9,9 +9,9 @@
 #include <linux/export.h>
 #include <linux/ftrace.h>
 #include <linux/kprobes.h>
+#include <linux/stacktrace.h>
 
 #include <asm/stack_pointer.h>
-#include <asm/stacktrace.h>
 
 struct return_address_data {
 	unsigned int level;
@@ -35,15 +35,11 @@ NOKPROBE_SYMBOL(save_return_addr);
 void *return_address(unsigned int level)
 {
 	struct return_address_data data;
-	struct stackframe frame;
 
 	data.level = level + 2;
 	data.addr = NULL;
 
-	start_backtrace(&frame,
-			(unsigned long)__builtin_frame_address(0),
-			(unsigned long)return_address);
-	walk_stackframe(current, &frame, save_return_addr, &data);
+	arch_stack_walk(save_return_addr, &data, current, NULL);
 
 	if (!data.level)
 		return data.addr;
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index be5f85b0a24d..f70573928f1b 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -189,11 +189,16 @@ static void __init setup_machine_fdt(phys_addr_t dt_phys)
 
 	if (!dt_virt || !early_init_dt_scan(dt_virt)) {
 		pr_crit("\n"
-			"Error: invalid device tree blob at physical address %pa (virtual address 0x%p)\n"
+			"Error: invalid device tree blob at physical address %pa (virtual address 0x%px)\n"
 			"The dtb must be 8-byte aligned and must not exceed 2 MB in size\n"
 			"\nPlease check your bootloader.",
 			&dt_phys, dt_virt);
 
+		/*
+		 * Note that in this _really_ early stage we cannot even BUG()
+		 * or oops, so the least terrible thing to do is cpu_relax(),
+		 * or else we could end-up printing non-initialized data, etc.
+		 */
 		while (true)
 			cpu_relax();
 	}
@@ -232,12 +237,14 @@ static void __init request_standard_resources(void)
 		if (memblock_is_nomap(region)) {
 			res->name  = "reserved";
 			res->flags = IORESOURCE_MEM;
+			res->start = __pfn_to_phys(memblock_region_reserved_base_pfn(region));
+			res->end = __pfn_to_phys(memblock_region_reserved_end_pfn(region)) - 1;
 		} else {
 			res->name  = "System RAM";
 			res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+			res->start = __pfn_to_phys(memblock_region_memory_base_pfn(region));
+			res->end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1;
 		}
-		res->start = __pfn_to_phys(memblock_region_memory_base_pfn(region));
-		res->end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1;
 
 		request_resource(&iomem_resource, res);
 
diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index 94f83cd44e50..0fb58fed54cb 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -33,8 +33,8 @@
  */
 
 
-void start_backtrace(struct stackframe *frame, unsigned long fp,
-		     unsigned long pc)
+static void start_backtrace(struct stackframe *frame, unsigned long fp,
+			    unsigned long pc)
 {
 	frame->fp = fp;
 	frame->pc = pc;
@@ -63,7 +63,8 @@ void start_backtrace(struct stackframe *frame, unsigned long fp,
  * records (e.g. a cycle), determined based on the location and fp value of A
  * and the location (but not the fp value) of B.
  */
-int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame)
+static int notrace unwind_frame(struct task_struct *tsk,
+				struct stackframe *frame)
 {
 	unsigned long fp = frame->fp;
 	struct stack_info info;
@@ -141,8 +142,9 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame)
 }
 NOKPROBE_SYMBOL(unwind_frame);
 
-void notrace walk_stackframe(struct task_struct *tsk, struct stackframe *frame,
-			     bool (*fn)(void *, unsigned long), void *data)
+static void notrace walk_stackframe(struct task_struct *tsk,
+				    struct stackframe *frame,
+				    bool (*fn)(void *, unsigned long), void *data)
 {
 	while (1) {
 		int ret;
@@ -156,24 +158,20 @@ void notrace walk_stackframe(struct task_struct *tsk, struct stackframe *frame,
 }
 NOKPROBE_SYMBOL(walk_stackframe);
 
-static void dump_backtrace_entry(unsigned long where, const char *loglvl)
+static bool dump_backtrace_entry(void *arg, unsigned long where)
 {
+	char *loglvl = arg;
 	printk("%s %pSb\n", loglvl, (void *)where);
+	return true;
 }
 
 void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk,
 		    const char *loglvl)
 {
-	struct stackframe frame;
-	int skip = 0;
-
 	pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk);
 
-	if (regs) {
-		if (user_mode(regs))
-			return;
-		skip = 1;
-	}
+	if (regs && user_mode(regs))
+		return;
 
 	if (!tsk)
 		tsk = current;
@@ -181,36 +179,8 @@ void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk,
 	if (!try_get_task_stack(tsk))
 		return;
 
-	if (tsk == current) {
-		start_backtrace(&frame,
-				(unsigned long)__builtin_frame_address(0),
-				(unsigned long)dump_backtrace);
-	} else {
-		/*
-		 * task blocked in __switch_to
-		 */
-		start_backtrace(&frame,
-				thread_saved_fp(tsk),
-				thread_saved_pc(tsk));
-	}
-
 	printk("%sCall trace:\n", loglvl);
-	do {
-		/* skip until specified stack frame */
-		if (!skip) {
-			dump_backtrace_entry(frame.pc, loglvl);
-		} else if (frame.fp == regs->regs[29]) {
-			skip = 0;
-			/*
-			 * Mostly, this is the case where this function is
-			 * called in panic/abort. As exception handler's
-			 * stack frame does not contain the corresponding pc
-			 * at which an exception has taken place, use regs->pc
-			 * instead.
-			 */
-			dump_backtrace_entry(regs->pc, loglvl);
-		}
-	} while (!unwind_frame(tsk, &frame));
+	arch_stack_walk(dump_backtrace_entry, (void *)loglvl, tsk, regs);
 
 	put_task_stack(tsk);
 }
@@ -221,8 +191,6 @@ void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl)
 	barrier();
 }
 
-#ifdef CONFIG_STACKTRACE
-
 noinline notrace void arch_stack_walk(stack_trace_consume_fn consume_entry,
 			      void *cookie, struct task_struct *task,
 			      struct pt_regs *regs)
@@ -241,5 +209,3 @@ noinline notrace void arch_stack_walk(stack_trace_consume_fn consume_entry,
 
 	walk_stackframe(task, &frame, consume_entry, cookie);
 }
-
-#endif
diff --git a/arch/arm64/kernel/time.c b/arch/arm64/kernel/time.c
index eebbc8d7123e..b5855eb7435d 100644
--- a/arch/arm64/kernel/time.c
+++ b/arch/arm64/kernel/time.c
@@ -18,6 +18,7 @@
 #include <linux/timex.h>
 #include <linux/errno.h>
 #include <linux/profile.h>
+#include <linux/stacktrace.h>
 #include <linux/syscore_ops.h>
 #include <linux/timer.h>
 #include <linux/irq.h>
@@ -29,25 +30,25 @@
 #include <clocksource/arm_arch_timer.h>
 
 #include <asm/thread_info.h>
-#include <asm/stacktrace.h>
 #include <asm/paravirt.h>
 
-unsigned long profile_pc(struct pt_regs *regs)
+static bool profile_pc_cb(void *arg, unsigned long pc)
 {
-	struct stackframe frame;
+	unsigned long *prof_pc = arg;
 
-	if (!in_lock_functions(regs->pc))
-		return regs->pc;
+	if (in_lock_functions(pc))
+		return true;
+	*prof_pc = pc;
+	return false;
+}
 
-	start_backtrace(&frame, regs->regs[29], regs->pc);
+unsigned long profile_pc(struct pt_regs *regs)
+{
+	unsigned long prof_pc = 0;
 
-	do {
-		int ret = unwind_frame(NULL, &frame);
-		if (ret < 0)
-			return 0;
-	} while (in_lock_functions(frame.pc));
+	arch_stack_walk(profile_pc_cb, &prof_pc, current, regs);
 
-	return frame.pc;
+	return prof_pc;
 }
 EXPORT_SYMBOL(profile_pc);
 
diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile
index 700767dfd221..60813497a381 100644
--- a/arch/arm64/kernel/vdso/Makefile
+++ b/arch/arm64/kernel/vdso/Makefile
@@ -32,6 +32,7 @@ ccflags-y += -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO
 CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) -Os $(CC_FLAGS_SCS) $(GCC_PLUGINS_CFLAGS) \
 				$(CC_FLAGS_LTO)
 KASAN_SANITIZE			:= n
+KCSAN_SANITIZE			:= n
 UBSAN_SANITIZE			:= n
 OBJECT_FILES_NON_STANDARD	:= y
 KCOV_INSTRUMENT			:= n
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index 275a27368a04..5abe0617f2af 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -140,9 +140,12 @@ static int kvm_handle_unknown_ec(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+/*
+ * Guest access to SVE registers should be routed to this handler only
+ * when the system doesn't support SVE.
+ */
 static int handle_sve(struct kvm_vcpu *vcpu)
 {
-	/* Until SVE is supported for guests: */
 	kvm_inject_undefined(vcpu);
 	return 1;
 }
diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
index c3c11974fa3b..24b2c2425b38 100644
--- a/arch/arm64/kvm/hyp/nvhe/Makefile
+++ b/arch/arm64/kvm/hyp/nvhe/Makefile
@@ -89,6 +89,7 @@ KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) $(CC_FLAGS_CFI)
 # cause crashes. Just disable it.
 GCOV_PROFILE	:= n
 KASAN_SANITIZE	:= n
+KCSAN_SANITIZE	:= n
 UBSAN_SANITIZE	:= n
 KCOV_INSTRUMENT	:= n
 
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index 426bd7fbc3fd..27386f0d81e4 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -52,10 +52,10 @@ int kvm_arm_init_sve(void)
 		 * The get_sve_reg()/set_sve_reg() ioctl interface will need
 		 * to be extended with multiple register slice support in
 		 * order to support vector lengths greater than
-		 * SVE_VL_ARCH_MAX:
+		 * VL_ARCH_MAX:
 		 */
-		if (WARN_ON(kvm_sve_max_vl > SVE_VL_ARCH_MAX))
-			kvm_sve_max_vl = SVE_VL_ARCH_MAX;
+		if (WARN_ON(kvm_sve_max_vl > VL_ARCH_MAX))
+			kvm_sve_max_vl = VL_ARCH_MAX;
 
 		/*
 		 * Don't even try to make use of vector lengths that
@@ -103,7 +103,7 @@ static int kvm_vcpu_finalize_sve(struct kvm_vcpu *vcpu)
 	 * set_sve_vls().  Double-check here just to be sure:
 	 */
 	if (WARN_ON(!sve_vl_valid(vl) || vl > sve_max_virtualisable_vl() ||
-		    vl > SVE_VL_ARCH_MAX))
+		    vl > VL_ARCH_MAX))
 		return -EIO;
 
 	buf = kzalloc(SVE_SIG_REGS_SIZE(sve_vq_from_vl(vl)), GFP_KERNEL_ACCOUNT);
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index e3ec1a44f94d..4dc2fba316ff 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1525,7 +1525,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
 	/* CRm=6 */
 	ID_SANITISED(ID_AA64ISAR0_EL1),
 	ID_SANITISED(ID_AA64ISAR1_EL1),
-	ID_UNALLOCATED(6,2),
+	ID_SANITISED(ID_AA64ISAR2_EL1),
 	ID_UNALLOCATED(6,3),
 	ID_UNALLOCATED(6,4),
 	ID_UNALLOCATED(6,5),
diff --git a/arch/arm64/lib/clear_page.S b/arch/arm64/lib/clear_page.S
index b84b179edba3..1fd5d790ab80 100644
--- a/arch/arm64/lib/clear_page.S
+++ b/arch/arm64/lib/clear_page.S
@@ -16,6 +16,7 @@
  */
 SYM_FUNC_START_PI(clear_page)
 	mrs	x1, dczid_el0
+	tbnz	x1, #4, 2f	/* Branch if DC ZVA is prohibited */
 	and	w1, w1, #0xf
 	mov	x2, #4
 	lsl	x1, x2, x1
@@ -25,5 +26,14 @@ SYM_FUNC_START_PI(clear_page)
 	tst	x0, #(PAGE_SIZE - 1)
 	b.ne	1b
 	ret
+
+2:	stnp	xzr, xzr, [x0]
+	stnp	xzr, xzr, [x0, #16]
+	stnp	xzr, xzr, [x0, #32]
+	stnp	xzr, xzr, [x0, #48]
+	add	x0, x0, #64
+	tst	x0, #(PAGE_SIZE - 1)
+	b.ne	2b
+	ret
 SYM_FUNC_END_PI(clear_page)
 EXPORT_SYMBOL(clear_page)
diff --git a/arch/arm64/lib/kasan_sw_tags.S b/arch/arm64/lib/kasan_sw_tags.S
index 5b04464c045e..20784ce75def 100644
--- a/arch/arm64/lib/kasan_sw_tags.S
+++ b/arch/arm64/lib/kasan_sw_tags.S
@@ -38,9 +38,7 @@
  * incremented by 256 prior to return).
  */
 SYM_CODE_START(__hwasan_tag_mismatch)
-#ifdef BTI_C
-	BTI_C
-#endif
+	bti	c
 	add	x29, sp, #232
 	stp	x2, x3, [sp, #8 * 2]
 	stp	x4, x5, [sp, #8 * 4]
diff --git a/arch/arm64/lib/mte.S b/arch/arm64/lib/mte.S
index e83643b3995f..f531dcb95174 100644
--- a/arch/arm64/lib/mte.S
+++ b/arch/arm64/lib/mte.S
@@ -43,17 +43,23 @@ SYM_FUNC_END(mte_clear_page_tags)
  *	x0 - address to the beginning of the page
  */
 SYM_FUNC_START(mte_zero_clear_page_tags)
+	and	x0, x0, #(1 << MTE_TAG_SHIFT) - 1	// clear the tag
 	mrs	x1, dczid_el0
+	tbnz	x1, #4, 2f	// Branch if DC GZVA is prohibited
 	and	w1, w1, #0xf
 	mov	x2, #4
 	lsl	x1, x2, x1
-	and	x0, x0, #(1 << MTE_TAG_SHIFT) - 1	// clear the tag
 
 1:	dc	gzva, x0
 	add	x0, x0, x1
 	tst	x0, #(PAGE_SIZE - 1)
 	b.ne	1b
 	ret
+
+2:	stz2g	x0, [x0], #(MTE_GRANULE_SIZE * 2)
+	tst	x0, #(PAGE_SIZE - 1)
+	b.ne	2b
+	ret
 SYM_FUNC_END(mte_zero_clear_page_tags)
 
 /*
diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c
index 11bf4f8aca68..d189cf4e70ea 100644
--- a/arch/arm64/lib/xor-neon.c
+++ b/arch/arm64/lib/xor-neon.c
@@ -167,7 +167,7 @@ void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
 	} while (--lines > 0);
 }
 
-struct xor_block_template const xor_block_inner_neon = {
+struct xor_block_template xor_block_inner_neon __ro_after_init = {
 	.name	= "__inner_neon__",
 	.do_2	= xor_arm64_neon_2,
 	.do_3	= xor_arm64_neon_3,
@@ -176,6 +176,151 @@ struct xor_block_template const xor_block_inner_neon = {
 };
 EXPORT_SYMBOL(xor_block_inner_neon);
 
+static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
+{
+	uint64x2_t res;
+
+	asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n"
+	    "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
+	    : "=w"(res) : "w"(p), "w"(q), "w"(r));
+	return res;
+}
+
+static void xor_arm64_eor3_3(unsigned long bytes, unsigned long *p1,
+			     unsigned long *p2, unsigned long *p3)
+{
+	uint64_t *dp1 = (uint64_t *)p1;
+	uint64_t *dp2 = (uint64_t *)p2;
+	uint64_t *dp3 = (uint64_t *)p3;
+
+	register uint64x2_t v0, v1, v2, v3;
+	long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+	do {
+		/* p1 ^= p2 ^ p3 */
+		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
+			  vld1q_u64(dp3 + 0));
+		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
+			  vld1q_u64(dp3 + 2));
+		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
+			  vld1q_u64(dp3 + 4));
+		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
+			  vld1q_u64(dp3 + 6));
+
+		/* store */
+		vst1q_u64(dp1 + 0, v0);
+		vst1q_u64(dp1 + 2, v1);
+		vst1q_u64(dp1 + 4, v2);
+		vst1q_u64(dp1 + 6, v3);
+
+		dp1 += 8;
+		dp2 += 8;
+		dp3 += 8;
+	} while (--lines > 0);
+}
+
+static void xor_arm64_eor3_4(unsigned long bytes, unsigned long *p1,
+			     unsigned long *p2, unsigned long *p3,
+			     unsigned long *p4)
+{
+	uint64_t *dp1 = (uint64_t *)p1;
+	uint64_t *dp2 = (uint64_t *)p2;
+	uint64_t *dp3 = (uint64_t *)p3;
+	uint64_t *dp4 = (uint64_t *)p4;
+
+	register uint64x2_t v0, v1, v2, v3;
+	long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+	do {
+		/* p1 ^= p2 ^ p3 */
+		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
+			  vld1q_u64(dp3 + 0));
+		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
+			  vld1q_u64(dp3 + 2));
+		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
+			  vld1q_u64(dp3 + 4));
+		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
+			  vld1q_u64(dp3 + 6));
+
+		/* p1 ^= p4 */
+		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
+		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
+		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
+		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));
+
+		/* store */
+		vst1q_u64(dp1 + 0, v0);
+		vst1q_u64(dp1 + 2, v1);
+		vst1q_u64(dp1 + 4, v2);
+		vst1q_u64(dp1 + 6, v3);
+
+		dp1 += 8;
+		dp2 += 8;
+		dp3 += 8;
+		dp4 += 8;
+	} while (--lines > 0);
+}
+
+static void xor_arm64_eor3_5(unsigned long bytes, unsigned long *p1,
+			     unsigned long *p2, unsigned long *p3,
+			     unsigned long *p4, unsigned long *p5)
+{
+	uint64_t *dp1 = (uint64_t *)p1;
+	uint64_t *dp2 = (uint64_t *)p2;
+	uint64_t *dp3 = (uint64_t *)p3;
+	uint64_t *dp4 = (uint64_t *)p4;
+	uint64_t *dp5 = (uint64_t *)p5;
+
+	register uint64x2_t v0, v1, v2, v3;
+	long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+	do {
+		/* p1 ^= p2 ^ p3 */
+		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
+			  vld1q_u64(dp3 + 0));
+		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
+			  vld1q_u64(dp3 + 2));
+		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
+			  vld1q_u64(dp3 + 4));
+		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
+			  vld1q_u64(dp3 + 6));
+
+		/* p1 ^= p4 ^ p5 */
+		v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0));
+		v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2));
+		v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4));
+		v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6));
+
+		/* store */
+		vst1q_u64(dp1 + 0, v0);
+		vst1q_u64(dp1 + 2, v1);
+		vst1q_u64(dp1 + 4, v2);
+		vst1q_u64(dp1 + 6, v3);
+
+		dp1 += 8;
+		dp2 += 8;
+		dp3 += 8;
+		dp4 += 8;
+		dp5 += 8;
+	} while (--lines > 0);
+}
+
+static int __init xor_neon_init(void)
+{
+	if (IS_ENABLED(CONFIG_AS_HAS_SHA3) && cpu_have_named_feature(SHA3)) {
+		xor_block_inner_neon.do_3 = xor_arm64_eor3_3;
+		xor_block_inner_neon.do_4 = xor_arm64_eor3_4;
+		xor_block_inner_neon.do_5 = xor_arm64_eor3_5;
+	}
+	return 0;
+}
+module_init(xor_neon_init);
+
+static void __exit xor_neon_exit(void)
+{
+}
+module_exit(xor_neon_exit);
+
 MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>");
 MODULE_DESCRIPTION("ARMv8 XOR Extensions");
 MODULE_LICENSE("GPL");
diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S
index 5051b3c1a4f1..7d0563db4201 100644
--- a/arch/arm64/mm/cache.S
+++ b/arch/arm64/mm/cache.S
@@ -140,15 +140,7 @@ SYM_FUNC_END(dcache_clean_pou)
  *	- start   - kernel start address of region
  *	- end     - kernel end address of region
  */
-SYM_FUNC_START_LOCAL(__dma_inv_area)
 SYM_FUNC_START_PI(dcache_inval_poc)
-	/* FALLTHROUGH */
-
-/*
- *	__dma_inv_area(start, end)
- *	- start   - virtual start address of region
- *	- end     - virtual end address of region
- */
 	dcache_line_size x2, x3
 	sub	x3, x2, #1
 	tst	x1, x3				// end cache line aligned?
@@ -167,7 +159,6 @@ SYM_FUNC_START_PI(dcache_inval_poc)
 	dsb	sy
 	ret
 SYM_FUNC_END_PI(dcache_inval_poc)
-SYM_FUNC_END(__dma_inv_area)
 
 /*
  *	dcache_clean_poc(start, end)
@@ -178,19 +169,10 @@ SYM_FUNC_END(__dma_inv_area)
  *	- start   - virtual start address of region
  *	- end     - virtual end address of region
  */
-SYM_FUNC_START_LOCAL(__dma_clean_area)
 SYM_FUNC_START_PI(dcache_clean_poc)
-	/* FALLTHROUGH */
-
-/*
- *	__dma_clean_area(start, end)
- *	- start   - virtual start address of region
- *	- end     - virtual end address of region
- */
 	dcache_by_line_op cvac, sy, x0, x1, x2, x3
 	ret
 SYM_FUNC_END_PI(dcache_clean_poc)
-SYM_FUNC_END(__dma_clean_area)
 
 /*
  *	dcache_clean_pop(start, end)
@@ -232,8 +214,8 @@ SYM_FUNC_END_PI(__dma_flush_area)
 SYM_FUNC_START_PI(__dma_map_area)
 	add	x1, x0, x1
 	cmp	w2, #DMA_FROM_DEVICE
-	b.eq	__dma_inv_area
-	b	__dma_clean_area
+	b.eq	__pi_dcache_inval_poc
+	b	__pi_dcache_clean_poc
 SYM_FUNC_END_PI(__dma_map_area)
 
 /*
@@ -245,6 +227,6 @@ SYM_FUNC_END_PI(__dma_map_area)
 SYM_FUNC_START_PI(__dma_unmap_area)
 	add	x1, x0, x1
 	cmp	w2, #DMA_TO_DEVICE
-	b.ne	__dma_inv_area
+	b.ne	__pi_dcache_inval_poc
 	ret
 SYM_FUNC_END_PI(__dma_unmap_area)
diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index cd72576ae2b7..b8b4cf0bcf39 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -35,8 +35,8 @@ static unsigned long *pinned_asid_map;
 #define ASID_FIRST_VERSION	(1UL << asid_bits)
 
 #define NUM_USER_ASIDS		ASID_FIRST_VERSION
-#define asid2idx(asid)		((asid) & ~ASID_MASK)
-#define idx2asid(idx)		asid2idx(idx)
+#define ctxid2asid(asid)	((asid) & ~ASID_MASK)
+#define asid2ctxid(asid, genid)	((asid) | (genid))
 
 /* Get the ASIDBits supported by the current CPU */
 static u32 get_cpu_asid_bits(void)
@@ -50,10 +50,10 @@ static u32 get_cpu_asid_bits(void)
 		pr_warn("CPU%d: Unknown ASID size (%d); assuming 8-bit\n",
 					smp_processor_id(),  fld);
 		fallthrough;
-	case 0:
+	case ID_AA64MMFR0_ASID_8:
 		asid = 8;
 		break;
-	case 2:
+	case ID_AA64MMFR0_ASID_16:
 		asid = 16;
 	}
 
@@ -120,7 +120,7 @@ static void flush_context(void)
 		 */
 		if (asid == 0)
 			asid = per_cpu(reserved_asids, i);
-		__set_bit(asid2idx(asid), asid_map);
+		__set_bit(ctxid2asid(asid), asid_map);
 		per_cpu(reserved_asids, i) = asid;
 	}
 
@@ -162,7 +162,7 @@ static u64 new_context(struct mm_struct *mm)
 	u64 generation = atomic64_read(&asid_generation);
 
 	if (asid != 0) {
-		u64 newasid = generation | (asid & ~ASID_MASK);
+		u64 newasid = asid2ctxid(ctxid2asid(asid), generation);
 
 		/*
 		 * If our current ASID was active during a rollover, we
@@ -183,7 +183,7 @@ static u64 new_context(struct mm_struct *mm)
 		 * We had a valid ASID in a previous life, so try to re-use
 		 * it if possible.
 		 */
-		if (!__test_and_set_bit(asid2idx(asid), asid_map))
+		if (!__test_and_set_bit(ctxid2asid(asid), asid_map))
 			return newasid;
 	}
 
@@ -209,7 +209,7 @@ static u64 new_context(struct mm_struct *mm)
 set_asid:
 	__set_bit(asid, asid_map);
 	cur_idx = asid;
-	return idx2asid(asid) | generation;
+	return asid2ctxid(asid, generation);
 }
 
 void check_and_switch_context(struct mm_struct *mm)
@@ -300,13 +300,13 @@ unsigned long arm64_mm_context_get(struct mm_struct *mm)
 	}
 
 	nr_pinned_asids++;
-	__set_bit(asid2idx(asid), pinned_asid_map);
+	__set_bit(ctxid2asid(asid), pinned_asid_map);
 	refcount_set(&mm->context.pinned, 1);
 
 out_unlock:
 	raw_spin_unlock_irqrestore(&cpu_asid_lock, flags);
 
-	asid &= ~ASID_MASK;
+	asid = ctxid2asid(asid);
 
 	/* Set the equivalent of USER_ASID_BIT */
 	if (asid && arm64_kernel_unmapped_at_el0())
@@ -327,7 +327,7 @@ void arm64_mm_context_put(struct mm_struct *mm)
 	raw_spin_lock_irqsave(&cpu_asid_lock, flags);
 
 	if (refcount_dec_and_test(&mm->context.pinned)) {
-		__clear_bit(asid2idx(asid), pinned_asid_map);
+		__clear_bit(ctxid2asid(asid), pinned_asid_map);
 		nr_pinned_asids--;
 	}
 
diff --git a/arch/arm64/mm/extable.c b/arch/arm64/mm/extable.c
index c3d53811a15e..c0181e60cc98 100644
--- a/arch/arm64/mm/extable.c
+++ b/arch/arm64/mm/extable.c
@@ -10,9 +10,6 @@
 #include <asm/asm-extable.h>
 #include <asm/ptrace.h>
 
-typedef bool (*ex_handler_t)(const struct exception_table_entry *,
-			     struct pt_regs *);
-
 static inline unsigned long
 get_ex_fixup(const struct exception_table_entry *ex)
 {
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 9ae24e3b72be..9a9e7675b187 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -297,6 +297,8 @@ static void die_kernel_fault(const char *msg, unsigned long addr,
 	pr_alert("Unable to handle kernel %s at virtual address %016lx\n", msg,
 		 addr);
 
+	kasan_non_canonical_hook(addr);
+
 	mem_abort_decode(esr);
 
 	show_pte(addr);
@@ -813,11 +815,8 @@ void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs)
 	if (!inf->fn(far, esr, regs))
 		return;
 
-	if (!user_mode(regs)) {
-		pr_alert("Unhandled fault at 0x%016lx\n", addr);
-		mem_abort_decode(esr);
-		show_pte(addr);
-	}
+	if (!user_mode(regs))
+		die_kernel_fault(inf->name, addr, esr, regs);
 
 	/*
 	 * At this point we have an unrecognized fault type whose tag bits may
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 5fa68c2ef1f8..b039877c743d 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -47,7 +47,7 @@ obj-y				:= cputable.o syscalls.o \
 				   udbg.o misc.o io.o misc_$(BITS).o \
 				   of_platform.o prom_parse.o firmware.o \
 				   hw_breakpoint_constraints.o interrupt.o \
-				   kdebugfs.o
+				   kdebugfs.o stacktrace.o
 obj-y				+= ptrace/
 obj-$(CONFIG_PPC64)		+= setup_64.o \
 				   paca.o nvram_64.o note.o
@@ -116,7 +116,6 @@ obj-$(CONFIG_OPTPROBES)		+= optprobes.o optprobes_head.o
 obj-$(CONFIG_KPROBES_ON_FTRACE)	+= kprobes-ftrace.o
 obj-$(CONFIG_UPROBES)		+= uprobes.o
 obj-$(CONFIG_PPC_UDBG_16550)	+= legacy_serial.o udbg_16550.o
-obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
 obj-$(CONFIG_SWIOTLB)		+= dma-swiotlb.o
 obj-$(CONFIG_ARCH_HAS_DMA_SET_MASK) += dma-mask.o
 
diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c
index 0fcdc0233fac..201ee206fb57 100644
--- a/arch/riscv/kernel/stacktrace.c
+++ b/arch/riscv/kernel/stacktrace.c
@@ -139,12 +139,8 @@ unsigned long __get_wchan(struct task_struct *task)
 	return pc;
 }
 
-#ifdef CONFIG_STACKTRACE
-
 noinline void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie,
 		     struct task_struct *task, struct pt_regs *regs)
 {
 	walk_stackframe(task, regs, consume_entry, cookie);
 }
-
-#endif /* CONFIG_STACKTRACE */
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index 80f500ffb55c..be8007f367aa 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -40,7 +40,7 @@ obj-y	+= sysinfo.o lgr.o os_info.o machine_kexec.o
 obj-y	+= runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o
 obj-y	+= entry.o reipl.o relocate_kernel.o kdebugfs.o alternative.o
 obj-y	+= nospec-branch.o ipl_vmparm.o machine_kexec_reloc.o unwind_bc.o
-obj-y	+= smp.o text_amode31.o
+obj-y	+= smp.o text_amode31.o stacktrace.o
 
 extra-y				+= head64.o vmlinux.lds
 
@@ -55,7 +55,6 @@ compat-obj-$(CONFIG_AUDIT)	+= compat_audit.o
 obj-$(CONFIG_COMPAT)		+= compat_linux.o compat_signal.o
 obj-$(CONFIG_COMPAT)		+= $(compat-obj-y)
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
-obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
 obj-$(CONFIG_KPROBES)		+= kprobes.o
 obj-$(CONFIG_KPROBES)		+= kprobes_insn_page.o
 obj-$(CONFIG_FUNCTION_TRACER)	+= mcount.o ftrace.o
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 38b2c779146f..68dea7ce6a22 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2476,7 +2476,7 @@ static int x86_pmu_event_init(struct perf_event *event)
 
 	if (READ_ONCE(x86_pmu.attr_rdpmc) &&
 	    !(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS))
-		event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED;
+		event->hw.flags |= PERF_EVENT_FLAG_USER_READ_CNT;
 
 	return err;
 }
@@ -2510,7 +2510,7 @@ void perf_clear_dirty_counters(void)
 
 static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
 {
-	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+	if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT))
 		return;
 
 	/*
@@ -2531,7 +2531,7 @@ static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
 
 static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm)
 {
-	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+	if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT))
 		return;
 
 	if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed))
@@ -2542,7 +2542,7 @@ static int x86_pmu_event_idx(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
 
-	if (!(hwc->flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+	if (!(hwc->flags & PERF_EVENT_FLAG_USER_READ_CNT))
 		return 0;
 
 	if (is_metric_idx(hwc->idx))
@@ -2725,7 +2725,7 @@ void arch_perf_update_userpage(struct perf_event *event,
 	userpg->cap_user_time = 0;
 	userpg->cap_user_time_zero = 0;
 	userpg->cap_user_rdpmc =
-		!!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED);
+		!!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT);
 	userpg->pmc_width = x86_pmu.cntval_bits;
 
 	if (!using_native_sched_clock() || !sched_clock_stable())
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 5480db242083..9d376e528dfc 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -74,7 +74,7 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode)
 #define PERF_X86_EVENT_PEBS_NA_HSW	0x0010 /* haswell style datala, unknown */
 #define PERF_X86_EVENT_EXCL		0x0020 /* HT exclusivity on counter */
 #define PERF_X86_EVENT_DYNAMIC		0x0040 /* dynamic alloc'd constraint */
-#define PERF_X86_EVENT_RDPMC_ALLOWED	0x0080 /* grant rdpmc permission */
+
 #define PERF_X86_EVENT_EXCL_ACCT	0x0100 /* accounted EXCL event */
 #define PERF_X86_EVENT_AUTO_RELOAD	0x0200 /* use PEBS auto-reload */
 #define PERF_X86_EVENT_LARGE_PEBS	0x0400 /* use large PEBS */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 2ff3e600f426..6aef9ee28a39 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -84,7 +84,7 @@ obj-$(CONFIG_IA32_EMULATION)	+= tls.o
 obj-y				+= step.o
 obj-$(CONFIG_INTEL_TXT)		+= tboot.o
 obj-$(CONFIG_ISA_DMA_API)	+= i8237.o
-obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
+obj-y				+= stacktrace.o
 obj-y				+= cpu/
 obj-y				+= acpi/
 obj-y				+= reboot.o
diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
index 4374af292e6d..e1a0c44bc686 100644
--- a/drivers/perf/Kconfig
+++ b/drivers/perf/Kconfig
@@ -43,7 +43,7 @@ config ARM_CCN
 
 config ARM_CMN
 	tristate "Arm CMN-600 PMU support"
-	depends on ARM64 || (COMPILE_TEST && 64BIT)
+	depends on ARM64 || COMPILE_TEST
 	help
 	  Support for PMU events monitoring on the Arm CMN-600 Coherent Mesh
 	  Network interconnect.
@@ -139,6 +139,13 @@ config ARM_DMC620_PMU
 	  Support for PMU events monitoring on the ARM DMC-620 memory
 	  controller.
 
+config MARVELL_CN10K_TAD_PMU
+	tristate "Marvell CN10K LLC-TAD PMU"
+	depends on ARM64 || (COMPILE_TEST && 64BIT)
+	help
+	  Provides support for Last-Level cache Tag-and-data Units (LLC-TAD)
+	  performance monitors on CN10K family silicons.
+
 source "drivers/perf/hisilicon/Kconfig"
 
 endmenu
diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
index 5260b116c7da..2db5418d5b0a 100644
--- a/drivers/perf/Makefile
+++ b/drivers/perf/Makefile
@@ -14,3 +14,4 @@ obj-$(CONFIG_THUNDERX2_PMU) += thunderx2_pmu.o
 obj-$(CONFIG_XGENE_PMU) += xgene_pmu.o
 obj-$(CONFIG_ARM_SPE_PMU) += arm_spe_pmu.o
 obj-$(CONFIG_ARM_DMC620_PMU) += arm_dmc620_pmu.o
+obj-$(CONFIG_MARVELL_CN10K_TAD_PMU) += marvell_cn10k_tad_pmu.o
diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c
index bc3cba5f8c5d..0e48adce57ef 100644
--- a/drivers/perf/arm-cmn.c
+++ b/drivers/perf/arm-cmn.c
@@ -5,8 +5,10 @@
 #include <linux/acpi.h>
 #include <linux/bitfield.h>
 #include <linux/bitops.h>
+#include <linux/debugfs.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
+#include <linux/io-64-nonatomic-lo-hi.h>
 #include <linux/kernel.h>
 #include <linux/list.h>
 #include <linux/module.h>
@@ -23,7 +25,10 @@
 #define CMN_NI_LOGICAL_ID		GENMASK_ULL(47, 32)
 
 #define CMN_NODEID_DEVID(reg)		((reg) & 3)
+#define CMN_NODEID_EXT_DEVID(reg)	((reg) & 1)
 #define CMN_NODEID_PID(reg)		(((reg) >> 2) & 1)
+#define CMN_NODEID_EXT_PID(reg)		(((reg) >> 1) & 3)
+#define CMN_NODEID_1x1_PID(reg)		(((reg) >> 2) & 7)
 #define CMN_NODEID_X(reg, bits)		((reg) >> (3 + (bits)))
 #define CMN_NODEID_Y(reg, bits)		(((reg) >> 3) & ((1U << (bits)) - 1))
 
@@ -34,20 +39,28 @@
 #define CMN_CHILD_NODE_ADDR		GENMASK(27, 0)
 #define CMN_CHILD_NODE_EXTERNAL		BIT(31)
 
-#define CMN_ADDR_NODE_PTR		GENMASK(27, 14)
+#define CMN_MAX_DIMENSION		8
+#define CMN_MAX_XPS			(CMN_MAX_DIMENSION * CMN_MAX_DIMENSION)
+#define CMN_MAX_DTMS			(CMN_MAX_XPS + (CMN_MAX_DIMENSION - 1) * 4)
 
-#define CMN_NODE_PTR_DEVID(ptr)		(((ptr) >> 2) & 3)
-#define CMN_NODE_PTR_PID(ptr)		((ptr) & 1)
-#define CMN_NODE_PTR_X(ptr, bits)	((ptr) >> (6 + (bits)))
-#define CMN_NODE_PTR_Y(ptr, bits)	(((ptr) >> 6) & ((1U << (bits)) - 1))
-
-#define CMN_MAX_XPS			(8 * 8)
-
-/* The CFG node has one other useful purpose */
+/* The CFG node has various info besides the discovery tree */
 #define CMN_CFGM_PERIPH_ID_2		0x0010
 #define CMN_CFGM_PID2_REVISION		GENMASK(7, 4)
 
-/* PMU registers occupy the 3rd 4KB page of each node's 16KB space */
+#define CMN_CFGM_INFO_GLOBAL		0x900
+#define CMN_INFO_MULTIPLE_DTM_EN	BIT_ULL(63)
+#define CMN_INFO_RSP_VC_NUM		GENMASK_ULL(53, 52)
+#define CMN_INFO_DAT_VC_NUM		GENMASK_ULL(51, 50)
+
+/* XPs also have some local topology info which has uses too */
+#define CMN_MXP__CONNECT_INFO_P0	0x0008
+#define CMN_MXP__CONNECT_INFO_P1	0x0010
+#define CMN_MXP__CONNECT_INFO_P2	0x0028
+#define CMN_MXP__CONNECT_INFO_P3	0x0030
+#define CMN_MXP__CONNECT_INFO_P4	0x0038
+#define CMN_MXP__CONNECT_INFO_P5	0x0040
+
+/* PMU registers occupy the 3rd 4KB page of each node's region */
 #define CMN_PMU_OFFSET			0x2000
 
 /* For most nodes, this is all there is */
@@ -57,6 +70,7 @@
 /* DTMs live in the PMU space of XP registers */
 #define CMN_DTM_WPn(n)			(0x1A0 + (n) * 0x18)
 #define CMN_DTM_WPn_CONFIG(n)		(CMN_DTM_WPn(n) + 0x00)
+#define CMN_DTM_WPn_CONFIG_WP_DEV_SEL2	GENMASK_ULL(18,17)
 #define CMN_DTM_WPn_CONFIG_WP_COMBINE	BIT(6)
 #define CMN_DTM_WPn_CONFIG_WP_EXCLUSIVE	BIT(5)
 #define CMN_DTM_WPn_CONFIG_WP_GRP	BIT(4)
@@ -81,7 +95,11 @@
 
 #define CMN_DTM_PMEVCNTSR		0x240
 
+#define CMN_DTM_UNIT_INFO		0x0910
+
 #define CMN_DTM_NUM_COUNTERS		4
+/* Want more local counters? Why not replicate the whole DTM! Ugh... */
+#define CMN_DTM_OFFSET(n)		((n) * 0x200)
 
 /* The DTC node is where the magic happens */
 #define CMN_DT_DTC_CTL			0x0a00
@@ -122,11 +140,11 @@
 
 
 /* Event attributes */
-#define CMN_CONFIG_TYPE			GENMASK(15, 0)
-#define CMN_CONFIG_EVENTID		GENMASK(23, 16)
-#define CMN_CONFIG_OCCUPID		GENMASK(27, 24)
-#define CMN_CONFIG_BYNODEID		BIT(31)
-#define CMN_CONFIG_NODEID		GENMASK(47, 32)
+#define CMN_CONFIG_TYPE			GENMASK_ULL(15, 0)
+#define CMN_CONFIG_EVENTID		GENMASK_ULL(23, 16)
+#define CMN_CONFIG_OCCUPID		GENMASK_ULL(27, 24)
+#define CMN_CONFIG_BYNODEID		BIT_ULL(31)
+#define CMN_CONFIG_NODEID		GENMASK_ULL(47, 32)
 
 #define CMN_EVENT_TYPE(event)		FIELD_GET(CMN_CONFIG_TYPE, (event)->attr.config)
 #define CMN_EVENT_EVENTID(event)	FIELD_GET(CMN_CONFIG_EVENTID, (event)->attr.config)
@@ -134,13 +152,13 @@
 #define CMN_EVENT_BYNODEID(event)	FIELD_GET(CMN_CONFIG_BYNODEID, (event)->attr.config)
 #define CMN_EVENT_NODEID(event)		FIELD_GET(CMN_CONFIG_NODEID, (event)->attr.config)
 
-#define CMN_CONFIG_WP_COMBINE		GENMASK(27, 24)
-#define CMN_CONFIG_WP_DEV_SEL		BIT(48)
-#define CMN_CONFIG_WP_CHN_SEL		GENMASK(50, 49)
-#define CMN_CONFIG_WP_GRP		BIT(52)
-#define CMN_CONFIG_WP_EXCLUSIVE		BIT(53)
-#define CMN_CONFIG1_WP_VAL		GENMASK(63, 0)
-#define CMN_CONFIG2_WP_MASK		GENMASK(63, 0)
+#define CMN_CONFIG_WP_COMBINE		GENMASK_ULL(27, 24)
+#define CMN_CONFIG_WP_DEV_SEL		GENMASK_ULL(50, 48)
+#define CMN_CONFIG_WP_CHN_SEL		GENMASK_ULL(55, 51)
+#define CMN_CONFIG_WP_GRP		BIT_ULL(56)
+#define CMN_CONFIG_WP_EXCLUSIVE		BIT_ULL(57)
+#define CMN_CONFIG1_WP_VAL		GENMASK_ULL(63, 0)
+#define CMN_CONFIG2_WP_MASK		GENMASK_ULL(63, 0)
 
 #define CMN_EVENT_WP_COMBINE(event)	FIELD_GET(CMN_CONFIG_WP_COMBINE, (event)->attr.config)
 #define CMN_EVENT_WP_DEV_SEL(event)	FIELD_GET(CMN_CONFIG_WP_DEV_SEL, (event)->attr.config)
@@ -155,7 +173,13 @@
 #define CMN_WP_DOWN			2
 
 
-/* r0px probably don't exist in silicon, thankfully */
+enum cmn_model {
+	CMN_ANY = -1,
+	CMN600 = 1,
+	CI700 = 2,
+};
+
+/* CMN-600 r0px shouldn't exist in silicon, thankfully */
 enum cmn_revision {
 	CMN600_R1P0,
 	CMN600_R1P1,
@@ -163,6 +187,10 @@ enum cmn_revision {
 	CMN600_R1P3,
 	CMN600_R2P0,
 	CMN600_R3P0,
+	CMN600_R3P1,
+	CI700_R0P0 = 0,
+	CI700_R1P0,
+	CI700_R2P0,
 };
 
 enum cmn_node_type {
@@ -174,9 +202,12 @@ enum cmn_node_type {
 	CMN_TYPE_HNF,
 	CMN_TYPE_XP,
 	CMN_TYPE_SBSX,
-	CMN_TYPE_RNI = 0xa,
+	CMN_TYPE_MPAM_S,
+	CMN_TYPE_MPAM_NS,
+	CMN_TYPE_RNI,
 	CMN_TYPE_RND = 0xd,
 	CMN_TYPE_RNSAM = 0xf,
+	CMN_TYPE_MTSX,
 	CMN_TYPE_CXRA = 0x100,
 	CMN_TYPE_CXHA = 0x101,
 	CMN_TYPE_CXLA = 0x102,
@@ -189,32 +220,32 @@ struct arm_cmn_node {
 	u16 id, logid;
 	enum cmn_node_type type;
 
+	int dtm;
 	union {
-		/* Device node */
+		/* DN/HN-F/CXHA */
 		struct {
-			int to_xp;
-			/* DN/HN-F/CXHA */
-			unsigned int occupid_val;
-			unsigned int occupid_count;
+			u8 occupid_val;
+			u8 occupid_count;
 		};
 		/* XP */
-		struct {
-			int dtc;
-			u32 pmu_config_low;
-			union {
-				u8 input_sel[4];
-				__le32 pmu_config_high;
-			};
-			s8 wp_event[4];
-		};
+		u8 dtc;
 	};
-
 	union {
 		u8 event[4];
 		__le32 event_sel;
 	};
 };
 
+struct arm_cmn_dtm {
+	void __iomem *base;
+	u32 pmu_config_low;
+	union {
+		u8 input_sel[4];
+		__le32 pmu_config_high;
+	};
+	s8 wp_event[4];
+};
+
 struct arm_cmn_dtc {
 	void __iomem *base;
 	int irq;
@@ -231,35 +262,238 @@ struct arm_cmn_dtc {
 struct arm_cmn {
 	struct device *dev;
 	void __iomem *base;
+	unsigned int state;
 
 	enum cmn_revision rev;
+	enum cmn_model model;
 	u8 mesh_x;
 	u8 mesh_y;
 	u16 num_xps;
 	u16 num_dns;
+	bool multi_dtm;
+	u8 ports_used;
+	struct {
+		unsigned int rsp_vc_num : 2;
+		unsigned int dat_vc_num : 2;
+	};
+
 	struct arm_cmn_node *xps;
 	struct arm_cmn_node *dns;
 
+	struct arm_cmn_dtm *dtms;
 	struct arm_cmn_dtc *dtc;
 	unsigned int num_dtcs;
 
 	int cpu;
 	struct hlist_node cpuhp_node;
 
-	unsigned int state;
 	struct pmu pmu;
+	struct dentry *debug;
 };
 
 #define to_cmn(p)	container_of(p, struct arm_cmn, pmu)
 
 static int arm_cmn_hp_state;
 
+struct arm_cmn_nodeid {
+	u8 x;
+	u8 y;
+	u8 port;
+	u8 dev;
+};
+
+static int arm_cmn_xyidbits(const struct arm_cmn *cmn)
+{
+	int dim = max(cmn->mesh_x, cmn->mesh_y);
+
+	return dim > 4 ? 3 : 2;
+}
+
+static struct arm_cmn_nodeid arm_cmn_nid(const struct arm_cmn *cmn, u16 id)
+{
+	struct arm_cmn_nodeid nid;
+
+	if (cmn->num_xps == 1) {
+		nid.x = 0;
+		nid.y = 0;
+		nid.port = CMN_NODEID_1x1_PID(id);
+		nid.dev = CMN_NODEID_DEVID(id);
+	} else {
+		int bits = arm_cmn_xyidbits(cmn);
+
+		nid.x = CMN_NODEID_X(id, bits);
+		nid.y = CMN_NODEID_Y(id, bits);
+		if (cmn->ports_used & 0xc) {
+			nid.port = CMN_NODEID_EXT_PID(id);
+			nid.dev = CMN_NODEID_EXT_DEVID(id);
+		} else {
+			nid.port = CMN_NODEID_PID(id);
+			nid.dev = CMN_NODEID_DEVID(id);
+		}
+	}
+	return nid;
+}
+
+static struct arm_cmn_node *arm_cmn_node_to_xp(const struct arm_cmn *cmn,
+					       const struct arm_cmn_node *dn)
+{
+	struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, dn->id);
+	int xp_idx = cmn->mesh_x * nid.y + nid.x;
+
+	return cmn->xps + xp_idx;
+}
+static struct arm_cmn_node *arm_cmn_node(const struct arm_cmn *cmn,
+					 enum cmn_node_type type)
+{
+	struct arm_cmn_node *dn;
+
+	for (dn = cmn->dns; dn->type; dn++)
+		if (dn->type == type)
+			return dn;
+	return NULL;
+}
+
+struct dentry *arm_cmn_debugfs;
+
+#ifdef CONFIG_DEBUG_FS
+static const char *arm_cmn_device_type(u8 type)
+{
+	switch(type) {
+		case 0x01: return "  RN-I  |";
+		case 0x02: return "  RN-D  |";
+		case 0x04: return " RN-F_B |";
+		case 0x05: return "RN-F_B_E|";
+		case 0x06: return " RN-F_A |";
+		case 0x07: return "RN-F_A_E|";
+		case 0x08: return "  HN-T  |";
+		case 0x09: return "  HN-I  |";
+		case 0x0a: return "  HN-D  |";
+		case 0x0c: return "  SN-F  |";
+		case 0x0d: return "  SBSX  |";
+		case 0x0e: return "  HN-F  |";
+		case 0x0f: return " SN-F_E |";
+		case 0x10: return " SN-F_D |";
+		case 0x11: return "  CXHA  |";
+		case 0x12: return "  CXRA  |";
+		case 0x13: return "  CXRH  |";
+		case 0x14: return " RN-F_D |";
+		case 0x15: return "RN-F_D_E|";
+		case 0x16: return " RN-F_C |";
+		case 0x17: return "RN-F_C_E|";
+		case 0x1c: return "  MTSX  |";
+		default:   return "        |";
+	}
+}
+
+static void arm_cmn_show_logid(struct seq_file *s, int x, int y, int p, int d)
+{
+	struct arm_cmn *cmn = s->private;
+	struct arm_cmn_node *dn;
+
+	for (dn = cmn->dns; dn->type; dn++) {
+		struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, dn->id);
+
+		if (dn->type == CMN_TYPE_XP)
+			continue;
+		/* Ignore the extra components that will overlap on some ports */
+		if (dn->type < CMN_TYPE_HNI)
+			continue;
+
+		if (nid.x != x || nid.y != y || nid.port != p || nid.dev != d)
+			continue;
+
+		seq_printf(s, "   #%-2d  |", dn->logid);
+		return;
+	}
+	seq_puts(s, "        |");
+}
+
+static int arm_cmn_map_show(struct seq_file *s, void *data)
+{
+	struct arm_cmn *cmn = s->private;
+	int x, y, p, pmax = fls(cmn->ports_used);
+
+	seq_puts(s, "     X");
+	for (x = 0; x < cmn->mesh_x; x++)
+		seq_printf(s, "    %d    ", x);
+	seq_puts(s, "\nY P D+");
+	y = cmn->mesh_y;
+	while (y--) {
+		int xp_base = cmn->mesh_x * y;
+		u8 port[6][CMN_MAX_DIMENSION];
+
+		for (x = 0; x < cmn->mesh_x; x++)
+			seq_puts(s, "--------+");
+
+		seq_printf(s, "\n%d    |", y);
+		for (x = 0; x < cmn->mesh_x; x++) {
+			struct arm_cmn_node *xp = cmn->xps + xp_base + x;
+			void __iomem *base = xp->pmu_base - CMN_PMU_OFFSET;
+
+			port[0][x] = readl_relaxed(base + CMN_MXP__CONNECT_INFO_P0);
+			port[1][x] = readl_relaxed(base + CMN_MXP__CONNECT_INFO_P1);
+			port[2][x] = readl_relaxed(base + CMN_MXP__CONNECT_INFO_P2);
+			port[3][x] = readl_relaxed(base + CMN_MXP__CONNECT_INFO_P3);
+			port[4][x] = readl_relaxed(base + CMN_MXP__CONNECT_INFO_P4);
+			port[5][x] = readl_relaxed(base + CMN_MXP__CONNECT_INFO_P5);
+			seq_printf(s, " XP #%-2d |", xp_base + x);
+		}
+
+		seq_puts(s, "\n     |");
+		for (x = 0; x < cmn->mesh_x; x++) {
+			u8 dtc = cmn->xps[xp_base + x].dtc;
+
+			if (dtc & (dtc - 1))
+				seq_puts(s, " DTC ?? |");
+			else
+				seq_printf(s, " DTC %ld  |", __ffs(dtc));
+		}
+		seq_puts(s, "\n     |");
+		for (x = 0; x < cmn->mesh_x; x++)
+			seq_puts(s, "........|");
+
+		for (p = 0; p < pmax; p++) {
+			seq_printf(s, "\n  %d  |", p);
+			for (x = 0; x < cmn->mesh_x; x++)
+				seq_puts(s, arm_cmn_device_type(port[p][x]));
+			seq_puts(s, "\n    0|");
+			for (x = 0; x < cmn->mesh_x; x++)
+				arm_cmn_show_logid(s, x, y, p, 0);
+			seq_puts(s, "\n    1|");
+			for (x = 0; x < cmn->mesh_x; x++)
+				arm_cmn_show_logid(s, x, y, p, 1);
+		}
+		seq_puts(s, "\n-----+");
+	}
+	for (x = 0; x < cmn->mesh_x; x++)
+		seq_puts(s, "--------+");
+	seq_puts(s, "\n");
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(arm_cmn_map);
+
+static void arm_cmn_debugfs_init(struct arm_cmn *cmn, int id)
+{
+	const char *name  = "map";
+
+	if (id > 0)
+		name = devm_kasprintf(cmn->dev, GFP_KERNEL, "map_%d", id);
+	if (!name)
+		return;
+
+	cmn->debug = debugfs_create_file(name, 0444, arm_cmn_debugfs, cmn, &arm_cmn_map_fops);
+}
+#else
+static void arm_cmn_debugfs_init(struct arm_cmn *cmn, int id) {}
+#endif
+
 struct arm_cmn_hw_event {
 	struct arm_cmn_node *dn;
 	u64 dtm_idx[2];
 	unsigned int dtc_idx;
 	u8 dtcs_used;
 	u8 num_dns;
+	u8 dtm_offset;
 };
 
 #define for_each_hw_dn(hw, dn, i) \
@@ -283,6 +517,7 @@ static unsigned int arm_cmn_get_index(u64 x[], unsigned int pos)
 
 struct arm_cmn_event_attr {
 	struct device_attribute attr;
+	enum cmn_model model;
 	enum cmn_node_type type;
 	u8 eventid;
 	u8 occupid;
@@ -294,50 +529,22 @@ struct arm_cmn_format_attr {
 	int config;
 };
 
-static int arm_cmn_xyidbits(const struct arm_cmn *cmn)
-{
-	return cmn->mesh_x > 4 || cmn->mesh_y > 4 ? 3 : 2;
-}
-
-static void arm_cmn_init_node_to_xp(const struct arm_cmn *cmn,
-				    struct arm_cmn_node *dn)
-{
-	int bits = arm_cmn_xyidbits(cmn);
-	int x = CMN_NODEID_X(dn->id, bits);
-	int y = CMN_NODEID_Y(dn->id, bits);
-	int xp_idx = cmn->mesh_x * y + x;
-
-	dn->to_xp = (cmn->xps + xp_idx) - dn;
-}
-
-static struct arm_cmn_node *arm_cmn_node_to_xp(struct arm_cmn_node *dn)
-{
-	return dn->type == CMN_TYPE_XP ? dn : dn + dn->to_xp;
-}
-
-static struct arm_cmn_node *arm_cmn_node(const struct arm_cmn *cmn,
-					 enum cmn_node_type type)
-{
-	int i;
-
-	for (i = 0; i < cmn->num_dns; i++)
-		if (cmn->dns[i].type == type)
-			return &cmn->dns[i];
-	return NULL;
-}
-
-#define CMN_EVENT_ATTR(_name, _type, _eventid, _occupid)		\
+#define CMN_EVENT_ATTR(_model, _name, _type, _eventid, _occupid)	\
 	(&((struct arm_cmn_event_attr[]) {{				\
 		.attr = __ATTR(_name, 0444, arm_cmn_event_show, NULL),	\
+		.model = _model,					\
 		.type = _type,						\
 		.eventid = _eventid,					\
 		.occupid = _occupid,					\
 	}})[0].attr.attr)
 
-static bool arm_cmn_is_occup_event(enum cmn_node_type type, unsigned int id)
+static bool arm_cmn_is_occup_event(enum cmn_model model,
+				   enum cmn_node_type type, unsigned int id)
 {
-	return (type == CMN_TYPE_DVM && id == 0x05) ||
-	       (type == CMN_TYPE_HNF && id == 0x0f);
+	if (type == CMN_TYPE_DVM)
+		return (model == CMN600 && id == 0x05) ||
+		       (model == CI700 && id == 0x0c);
+	return type == CMN_TYPE_HNF && id == 0x0f;
 }
 
 static ssize_t arm_cmn_event_show(struct device *dev,
@@ -355,7 +562,7 @@ static ssize_t arm_cmn_event_show(struct device *dev,
 				  "type=0x%x,eventid=0x%x,wp_dev_sel=?,wp_chn_sel=?,wp_grp=?,wp_val=?,wp_mask=?\n",
 				  eattr->type, eattr->eventid);
 
-	if (arm_cmn_is_occup_event(eattr->type, eattr->eventid))
+	if (arm_cmn_is_occup_event(eattr->model, eattr->type, eattr->eventid))
 		return sysfs_emit(buf, "type=0x%x,eventid=0x%x,occupid=0x%x\n",
 				  eattr->type, eattr->eventid, eattr->occupid);
 
@@ -370,60 +577,81 @@ static umode_t arm_cmn_event_attr_is_visible(struct kobject *kobj,
 	struct device *dev = kobj_to_dev(kobj);
 	struct arm_cmn *cmn = to_cmn(dev_get_drvdata(dev));
 	struct arm_cmn_event_attr *eattr;
-	enum cmn_node_type type;
 
 	eattr = container_of(attr, typeof(*eattr), attr.attr);
-	type = eattr->type;
 
-	/* Watchpoints aren't nodes */
-	if (type == CMN_TYPE_WP)
-		type = CMN_TYPE_XP;
+	if (!(eattr->model & cmn->model))
+		return 0;
+
+	/* Watchpoints aren't nodes, so avoid confusion */
+	if (eattr->type == CMN_TYPE_WP)
+		return attr->mode;
+
+	/* Hide XP events for unused interfaces/channels */
+	if (eattr->type == CMN_TYPE_XP) {
+		unsigned int intf = (eattr->eventid >> 2) & 7;
+		unsigned int chan = eattr->eventid >> 5;
+
+		if ((intf & 4) && !(cmn->ports_used & BIT(intf & 3)))
+			return 0;
+
+		if ((chan == 5 && cmn->rsp_vc_num < 2) ||
+		    (chan == 6 && cmn->dat_vc_num < 2))
+			return 0;
+	}
 
 	/* Revision-specific differences */
-	if (cmn->rev < CMN600_R1P2) {
-		if (type == CMN_TYPE_HNF && eattr->eventid == 0x1b)
+	if (cmn->model == CMN600 && cmn->rev < CMN600_R1P2) {
+		if (eattr->type == CMN_TYPE_HNF && eattr->eventid == 0x1b)
 			return 0;
 	}
 
-	if (!arm_cmn_node(cmn, type))
+	if (!arm_cmn_node(cmn, eattr->type))
 		return 0;
 
 	return attr->mode;
 }
 
-#define _CMN_EVENT_DVM(_name, _event, _occup)			\
-	CMN_EVENT_ATTR(dn_##_name, CMN_TYPE_DVM, _event, _occup)
+#define _CMN_EVENT_DVM(_model, _name, _event, _occup)		\
+	CMN_EVENT_ATTR(_model, dn_##_name, CMN_TYPE_DVM, _event, _occup)
 #define CMN_EVENT_DTC(_name)					\
-	CMN_EVENT_ATTR(dtc_##_name, CMN_TYPE_DTC, 0, 0)
-#define _CMN_EVENT_HNF(_name, _event, _occup)			\
-	CMN_EVENT_ATTR(hnf_##_name, CMN_TYPE_HNF, _event, _occup)
+	CMN_EVENT_ATTR(CMN_ANY, dtc_##_name, CMN_TYPE_DTC, 0, 0)
+#define _CMN_EVENT_HNF(_model, _name, _event, _occup)		\
+	CMN_EVENT_ATTR(_model, hnf_##_name, CMN_TYPE_HNF, _event, _occup)
 #define CMN_EVENT_HNI(_name, _event)				\
-	CMN_EVENT_ATTR(hni_##_name, CMN_TYPE_HNI, _event, 0)
+	CMN_EVENT_ATTR(CMN_ANY, hni_##_name, CMN_TYPE_HNI, _event, 0)
 #define __CMN_EVENT_XP(_name, _event)				\
-	CMN_EVENT_ATTR(mxp_##_name, CMN_TYPE_XP, _event, 0)
-#define CMN_EVENT_SBSX(_name, _event)				\
-	CMN_EVENT_ATTR(sbsx_##_name, CMN_TYPE_SBSX, _event, 0)
-#define CMN_EVENT_RNID(_name, _event)				\
-	CMN_EVENT_ATTR(rnid_##_name, CMN_TYPE_RNI, _event, 0)
-
-#define CMN_EVENT_DVM(_name, _event)				\
-	_CMN_EVENT_DVM(_name, _event, 0)
-#define CMN_EVENT_HNF(_name, _event)				\
-	_CMN_EVENT_HNF(_name, _event, 0)
+	CMN_EVENT_ATTR(CMN_ANY, mxp_##_name, CMN_TYPE_XP, _event, 0)
+#define CMN_EVENT_SBSX(_model, _name, _event)			\
+	CMN_EVENT_ATTR(_model, sbsx_##_name, CMN_TYPE_SBSX, _event, 0)
+#define CMN_EVENT_RNID(_model, _name, _event)			\
+	CMN_EVENT_ATTR(_model, rnid_##_name, CMN_TYPE_RNI, _event, 0)
+#define CMN_EVENT_MTSX(_name, _event)				\
+	CMN_EVENT_ATTR(CMN_ANY, mtsx_##_name, CMN_TYPE_MTSX, _event, 0)
+
+#define CMN_EVENT_DVM(_model, _name, _event)			\
+	_CMN_EVENT_DVM(_model, _name, _event, 0)
+#define CMN_EVENT_HNF(_model, _name, _event)			\
+	_CMN_EVENT_HNF(_model, _name, _event, 0)
 #define _CMN_EVENT_XP(_name, _event)				\
 	__CMN_EVENT_XP(e_##_name, (_event) | (0 << 2)),		\
 	__CMN_EVENT_XP(w_##_name, (_event) | (1 << 2)),		\
 	__CMN_EVENT_XP(n_##_name, (_event) | (2 << 2)),		\
 	__CMN_EVENT_XP(s_##_name, (_event) | (3 << 2)),		\
 	__CMN_EVENT_XP(p0_##_name, (_event) | (4 << 2)),	\
-	__CMN_EVENT_XP(p1_##_name, (_event) | (5 << 2))
+	__CMN_EVENT_XP(p1_##_name, (_event) | (5 << 2)),	\
+	__CMN_EVENT_XP(p2_##_name, (_event) | (6 << 2)),	\
+	__CMN_EVENT_XP(p3_##_name, (_event) | (7 << 2))
 
 /* Good thing there are only 3 fundamental XP events... */
 #define CMN_EVENT_XP(_name, _event)				\
 	_CMN_EVENT_XP(req_##_name, (_event) | (0 << 5)),	\
 	_CMN_EVENT_XP(rsp_##_name, (_event) | (1 << 5)),	\
 	_CMN_EVENT_XP(snp_##_name, (_event) | (2 << 5)),	\
-	_CMN_EVENT_XP(dat_##_name, (_event) | (3 << 5))
+	_CMN_EVENT_XP(dat_##_name, (_event) | (3 << 5)),	\
+	_CMN_EVENT_XP(pub_##_name, (_event) | (4 << 5)),	\
+	_CMN_EVENT_XP(rsp2_##_name, (_event) | (5 << 5)),	\
+	_CMN_EVENT_XP(dat2_##_name, (_event) | (6 << 5))
 
 
 static struct attribute *arm_cmn_event_attrs[] = {
@@ -434,115 +662,152 @@ static struct attribute *arm_cmn_event_attrs[] = {
 	 * slot, but our lazy short-cut of using the DTM counter index for
 	 * the PMU index as well happens to avoid that by construction.
 	 */
-	CMN_EVENT_DVM(rxreq_dvmop,	0x01),
-	CMN_EVENT_DVM(rxreq_dvmsync,	0x02),
-	CMN_EVENT_DVM(rxreq_dvmop_vmid_filtered, 0x03),
-	CMN_EVENT_DVM(rxreq_retried,	0x04),
-	_CMN_EVENT_DVM(rxreq_trk_occupancy_all, 0x05, 0),
-	_CMN_EVENT_DVM(rxreq_trk_occupancy_dvmop, 0x05, 1),
-	_CMN_EVENT_DVM(rxreq_trk_occupancy_dvmsync, 0x05, 2),
-
-	CMN_EVENT_HNF(cache_miss,	0x01),
-	CMN_EVENT_HNF(slc_sf_cache_access, 0x02),
-	CMN_EVENT_HNF(cache_fill,	0x03),
-	CMN_EVENT_HNF(pocq_retry,	0x04),
-	CMN_EVENT_HNF(pocq_reqs_recvd,	0x05),
-	CMN_EVENT_HNF(sf_hit,		0x06),
-	CMN_EVENT_HNF(sf_evictions,	0x07),
-	CMN_EVENT_HNF(dir_snoops_sent,	0x08),
-	CMN_EVENT_HNF(brd_snoops_sent,	0x09),
-	CMN_EVENT_HNF(slc_eviction,	0x0a),
-	CMN_EVENT_HNF(slc_fill_invalid_way, 0x0b),
-	CMN_EVENT_HNF(mc_retries,	0x0c),
-	CMN_EVENT_HNF(mc_reqs,		0x0d),
-	CMN_EVENT_HNF(qos_hh_retry,	0x0e),
-	_CMN_EVENT_HNF(qos_pocq_occupancy_all, 0x0f, 0),
-	_CMN_EVENT_HNF(qos_pocq_occupancy_read, 0x0f, 1),
-	_CMN_EVENT_HNF(qos_pocq_occupancy_write, 0x0f, 2),
-	_CMN_EVENT_HNF(qos_pocq_occupancy_atomic, 0x0f, 3),
-	_CMN_EVENT_HNF(qos_pocq_occupancy_stash, 0x0f, 4),
-	CMN_EVENT_HNF(pocq_addrhaz,	0x10),
-	CMN_EVENT_HNF(pocq_atomic_addrhaz, 0x11),
-	CMN_EVENT_HNF(ld_st_swp_adq_full, 0x12),
-	CMN_EVENT_HNF(cmp_adq_full,	0x13),
-	CMN_EVENT_HNF(txdat_stall,	0x14),
-	CMN_EVENT_HNF(txrsp_stall,	0x15),
-	CMN_EVENT_HNF(seq_full,		0x16),
-	CMN_EVENT_HNF(seq_hit,		0x17),
-	CMN_EVENT_HNF(snp_sent,		0x18),
-	CMN_EVENT_HNF(sfbi_dir_snp_sent, 0x19),
-	CMN_EVENT_HNF(sfbi_brd_snp_sent, 0x1a),
-	CMN_EVENT_HNF(snp_sent_untrk,	0x1b),
-	CMN_EVENT_HNF(intv_dirty,	0x1c),
-	CMN_EVENT_HNF(stash_snp_sent,	0x1d),
-	CMN_EVENT_HNF(stash_data_pull,	0x1e),
-	CMN_EVENT_HNF(snp_fwded,	0x1f),
-
-	CMN_EVENT_HNI(rrt_rd_occ_cnt_ovfl, 0x20),
-	CMN_EVENT_HNI(rrt_wr_occ_cnt_ovfl, 0x21),
-	CMN_EVENT_HNI(rdt_rd_occ_cnt_ovfl, 0x22),
-	CMN_EVENT_HNI(rdt_wr_occ_cnt_ovfl, 0x23),
-	CMN_EVENT_HNI(wdb_occ_cnt_ovfl,	0x24),
-	CMN_EVENT_HNI(rrt_rd_alloc,	0x25),
-	CMN_EVENT_HNI(rrt_wr_alloc,	0x26),
-	CMN_EVENT_HNI(rdt_rd_alloc,	0x27),
-	CMN_EVENT_HNI(rdt_wr_alloc,	0x28),
-	CMN_EVENT_HNI(wdb_alloc,	0x29),
-	CMN_EVENT_HNI(txrsp_retryack,	0x2a),
-	CMN_EVENT_HNI(arvalid_no_arready, 0x2b),
-	CMN_EVENT_HNI(arready_no_arvalid, 0x2c),
-	CMN_EVENT_HNI(awvalid_no_awready, 0x2d),
-	CMN_EVENT_HNI(awready_no_awvalid, 0x2e),
-	CMN_EVENT_HNI(wvalid_no_wready,	0x2f),
-	CMN_EVENT_HNI(txdat_stall,	0x30),
-	CMN_EVENT_HNI(nonpcie_serialization, 0x31),
-	CMN_EVENT_HNI(pcie_serialization, 0x32),
-
-	CMN_EVENT_XP(txflit_valid,	0x01),
-	CMN_EVENT_XP(txflit_stall,	0x02),
-	CMN_EVENT_XP(partial_dat_flit,	0x03),
+	CMN_EVENT_DVM(CMN600, rxreq_dvmop,		0x01),
+	CMN_EVENT_DVM(CMN600, rxreq_dvmsync,		0x02),
+	CMN_EVENT_DVM(CMN600, rxreq_dvmop_vmid_filtered, 0x03),
+	CMN_EVENT_DVM(CMN600, rxreq_retried,		0x04),
+	_CMN_EVENT_DVM(CMN600, rxreq_trk_occupancy_all, 0x05, 0),
+	_CMN_EVENT_DVM(CMN600, rxreq_trk_occupancy_dvmop, 0x05, 1),
+	_CMN_EVENT_DVM(CMN600, rxreq_trk_occupancy_dvmsync, 0x05, 2),
+	CMN_EVENT_DVM(CI700, dvmop_tlbi,		0x01),
+	CMN_EVENT_DVM(CI700, dvmop_bpi,			0x02),
+	CMN_EVENT_DVM(CI700, dvmop_pici,		0x03),
+	CMN_EVENT_DVM(CI700, dvmop_vici,		0x04),
+	CMN_EVENT_DVM(CI700, dvmsync,			0x05),
+	CMN_EVENT_DVM(CI700, vmid_filtered,		0x06),
+	CMN_EVENT_DVM(CI700, rndop_filtered,		0x07),
+	CMN_EVENT_DVM(CI700, retry,			0x08),
+	CMN_EVENT_DVM(CI700, txsnp_flitv,		0x09),
+	CMN_EVENT_DVM(CI700, txsnp_stall,		0x0a),
+	CMN_EVENT_DVM(CI700, trkfull,			0x0b),
+	_CMN_EVENT_DVM(CI700, trk_occupancy_all,	0x0c, 0),
+	_CMN_EVENT_DVM(CI700, trk_occupancy_dvmop,	0x0c, 1),
+	_CMN_EVENT_DVM(CI700, trk_occupancy_dvmsync,	0x0c, 2),
+
+	CMN_EVENT_HNF(CMN_ANY, cache_miss,		0x01),
+	CMN_EVENT_HNF(CMN_ANY, slc_sf_cache_access,	0x02),
+	CMN_EVENT_HNF(CMN_ANY, cache_fill,		0x03),
+	CMN_EVENT_HNF(CMN_ANY, pocq_retry,		0x04),
+	CMN_EVENT_HNF(CMN_ANY, pocq_reqs_recvd,		0x05),
+	CMN_EVENT_HNF(CMN_ANY, sf_hit,			0x06),
+	CMN_EVENT_HNF(CMN_ANY, sf_evictions,		0x07),
+	CMN_EVENT_HNF(CMN_ANY, dir_snoops_sent,		0x08),
+	CMN_EVENT_HNF(CMN_ANY, brd_snoops_sent,		0x09),
+	CMN_EVENT_HNF(CMN_ANY, slc_eviction,		0x0a),
+	CMN_EVENT_HNF(CMN_ANY, slc_fill_invalid_way,	0x0b),
+	CMN_EVENT_HNF(CMN_ANY, mc_retries,		0x0c),
+	CMN_EVENT_HNF(CMN_ANY, mc_reqs,			0x0d),
+	CMN_EVENT_HNF(CMN_ANY, qos_hh_retry,		0x0e),
+	_CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_all,	0x0f, 0),
+	_CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_read, 0x0f, 1),
+	_CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_write, 0x0f, 2),
+	_CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_atomic, 0x0f, 3),
+	_CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_stash, 0x0f, 4),
+	CMN_EVENT_HNF(CMN_ANY, pocq_addrhaz,		0x10),
+	CMN_EVENT_HNF(CMN_ANY, pocq_atomic_addrhaz,	0x11),
+	CMN_EVENT_HNF(CMN_ANY, ld_st_swp_adq_full,	0x12),
+	CMN_EVENT_HNF(CMN_ANY, cmp_adq_full,		0x13),
+	CMN_EVENT_HNF(CMN_ANY, txdat_stall,		0x14),
+	CMN_EVENT_HNF(CMN_ANY, txrsp_stall,		0x15),
+	CMN_EVENT_HNF(CMN_ANY, seq_full,		0x16),
+	CMN_EVENT_HNF(CMN_ANY, seq_hit,			0x17),
+	CMN_EVENT_HNF(CMN_ANY, snp_sent,		0x18),
+	CMN_EVENT_HNF(CMN_ANY, sfbi_dir_snp_sent,	0x19),
+	CMN_EVENT_HNF(CMN_ANY, sfbi_brd_snp_sent,	0x1a),
+	CMN_EVENT_HNF(CMN_ANY, snp_sent_untrk,		0x1b),
+	CMN_EVENT_HNF(CMN_ANY, intv_dirty,		0x1c),
+	CMN_EVENT_HNF(CMN_ANY, stash_snp_sent,		0x1d),
+	CMN_EVENT_HNF(CMN_ANY, stash_data_pull,		0x1e),
+	CMN_EVENT_HNF(CMN_ANY, snp_fwded,		0x1f),
+	CMN_EVENT_HNF(CI700, atomic_fwd,		0x20),
+	CMN_EVENT_HNF(CI700, mpam_hardlim,		0x21),
+	CMN_EVENT_HNF(CI700, mpam_softlim,		0x22),
+
+	CMN_EVENT_HNI(rrt_rd_occ_cnt_ovfl,		0x20),
+	CMN_EVENT_HNI(rrt_wr_occ_cnt_ovfl,		0x21),
+	CMN_EVENT_HNI(rdt_rd_occ_cnt_ovfl,		0x22),
+	CMN_EVENT_HNI(rdt_wr_occ_cnt_ovfl,		0x23),
+	CMN_EVENT_HNI(wdb_occ_cnt_ovfl,			0x24),
+	CMN_EVENT_HNI(rrt_rd_alloc,			0x25),
+	CMN_EVENT_HNI(rrt_wr_alloc,			0x26),
+	CMN_EVENT_HNI(rdt_rd_alloc,			0x27),
+	CMN_EVENT_HNI(rdt_wr_alloc,			0x28),
+	CMN_EVENT_HNI(wdb_alloc,			0x29),
+	CMN_EVENT_HNI(txrsp_retryack,			0x2a),
+	CMN_EVENT_HNI(arvalid_no_arready,		0x2b),
+	CMN_EVENT_HNI(arready_no_arvalid,		0x2c),
+	CMN_EVENT_HNI(awvalid_no_awready,		0x2d),
+	CMN_EVENT_HNI(awready_no_awvalid,		0x2e),
+	CMN_EVENT_HNI(wvalid_no_wready,			0x2f),
+	CMN_EVENT_HNI(txdat_stall,			0x30),
+	CMN_EVENT_HNI(nonpcie_serialization,		0x31),
+	CMN_EVENT_HNI(pcie_serialization,		0x32),
+
+	CMN_EVENT_XP(txflit_valid,			0x01),
+	CMN_EVENT_XP(txflit_stall,			0x02),
+	CMN_EVENT_XP(partial_dat_flit,			0x03),
 	/* We treat watchpoints as a special made-up class of XP events */
-	CMN_EVENT_ATTR(watchpoint_up, CMN_TYPE_WP, 0, 0),
-	CMN_EVENT_ATTR(watchpoint_down, CMN_TYPE_WP, 2, 0),
-
-	CMN_EVENT_SBSX(rd_req,		0x01),
-	CMN_EVENT_SBSX(wr_req,		0x02),
-	CMN_EVENT_SBSX(cmo_req,		0x03),
-	CMN_EVENT_SBSX(txrsp_retryack,	0x04),
-	CMN_EVENT_SBSX(txdat_flitv,	0x05),
-	CMN_EVENT_SBSX(txrsp_flitv,	0x06),
-	CMN_EVENT_SBSX(rd_req_trkr_occ_cnt_ovfl, 0x11),
-	CMN_EVENT_SBSX(wr_req_trkr_occ_cnt_ovfl, 0x12),
-	CMN_EVENT_SBSX(cmo_req_trkr_occ_cnt_ovfl, 0x13),
-	CMN_EVENT_SBSX(wdb_occ_cnt_ovfl, 0x14),
-	CMN_EVENT_SBSX(rd_axi_trkr_occ_cnt_ovfl, 0x15),
-	CMN_EVENT_SBSX(cmo_axi_trkr_occ_cnt_ovfl, 0x16),
-	CMN_EVENT_SBSX(arvalid_no_arready, 0x21),
-	CMN_EVENT_SBSX(awvalid_no_awready, 0x22),
-	CMN_EVENT_SBSX(wvalid_no_wready, 0x23),
-	CMN_EVENT_SBSX(txdat_stall,	0x24),
-	CMN_EVENT_SBSX(txrsp_stall,	0x25),
-
-	CMN_EVENT_RNID(s0_rdata_beats,	0x01),
-	CMN_EVENT_RNID(s1_rdata_beats,	0x02),
-	CMN_EVENT_RNID(s2_rdata_beats,	0x03),
-	CMN_EVENT_RNID(rxdat_flits,	0x04),
-	CMN_EVENT_RNID(txdat_flits,	0x05),
-	CMN_EVENT_RNID(txreq_flits_total, 0x06),
-	CMN_EVENT_RNID(txreq_flits_retried, 0x07),
-	CMN_EVENT_RNID(rrt_occ_ovfl,	0x08),
-	CMN_EVENT_RNID(wrt_occ_ovfl,	0x09),
-	CMN_EVENT_RNID(txreq_flits_replayed, 0x0a),
-	CMN_EVENT_RNID(wrcancel_sent,	0x0b),
-	CMN_EVENT_RNID(s0_wdata_beats,	0x0c),
-	CMN_EVENT_RNID(s1_wdata_beats,	0x0d),
-	CMN_EVENT_RNID(s2_wdata_beats,	0x0e),
-	CMN_EVENT_RNID(rrt_alloc,	0x0f),
-	CMN_EVENT_RNID(wrt_alloc,	0x10),
-	CMN_EVENT_RNID(rdb_unord,	0x11),
-	CMN_EVENT_RNID(rdb_replay,	0x12),
-	CMN_EVENT_RNID(rdb_hybrid,	0x13),
-	CMN_EVENT_RNID(rdb_ord,		0x14),
+	CMN_EVENT_ATTR(CMN_ANY, watchpoint_up, CMN_TYPE_WP, CMN_WP_UP, 0),
+	CMN_EVENT_ATTR(CMN_ANY, watchpoint_down, CMN_TYPE_WP, CMN_WP_DOWN, 0),
+
+	CMN_EVENT_SBSX(CMN_ANY, rd_req,			0x01),
+	CMN_EVENT_SBSX(CMN_ANY, wr_req,			0x02),
+	CMN_EVENT_SBSX(CMN_ANY, cmo_req,		0x03),
+	CMN_EVENT_SBSX(CMN_ANY, txrsp_retryack,		0x04),
+	CMN_EVENT_SBSX(CMN_ANY, txdat_flitv,		0x05),
+	CMN_EVENT_SBSX(CMN_ANY, txrsp_flitv,		0x06),
+	CMN_EVENT_SBSX(CMN_ANY, rd_req_trkr_occ_cnt_ovfl, 0x11),
+	CMN_EVENT_SBSX(CMN_ANY, wr_req_trkr_occ_cnt_ovfl, 0x12),
+	CMN_EVENT_SBSX(CMN_ANY, cmo_req_trkr_occ_cnt_ovfl, 0x13),
+	CMN_EVENT_SBSX(CMN_ANY, wdb_occ_cnt_ovfl,	0x14),
+	CMN_EVENT_SBSX(CMN_ANY, rd_axi_trkr_occ_cnt_ovfl, 0x15),
+	CMN_EVENT_SBSX(CMN_ANY, cmo_axi_trkr_occ_cnt_ovfl, 0x16),
+	CMN_EVENT_SBSX(CI700, rdb_occ_cnt_ovfl,		0x17),
+	CMN_EVENT_SBSX(CMN_ANY, arvalid_no_arready,	0x21),
+	CMN_EVENT_SBSX(CMN_ANY, awvalid_no_awready,	0x22),
+	CMN_EVENT_SBSX(CMN_ANY, wvalid_no_wready,	0x23),
+	CMN_EVENT_SBSX(CMN_ANY, txdat_stall,		0x24),
+	CMN_EVENT_SBSX(CMN_ANY, txrsp_stall,		0x25),
+
+	CMN_EVENT_RNID(CMN_ANY, s0_rdata_beats,		0x01),
+	CMN_EVENT_RNID(CMN_ANY, s1_rdata_beats,		0x02),
+	CMN_EVENT_RNID(CMN_ANY, s2_rdata_beats,		0x03),
+	CMN_EVENT_RNID(CMN_ANY, rxdat_flits,		0x04),
+	CMN_EVENT_RNID(CMN_ANY, txdat_flits,		0x05),
+	CMN_EVENT_RNID(CMN_ANY, txreq_flits_total,	0x06),
+	CMN_EVENT_RNID(CMN_ANY, txreq_flits_retried,	0x07),
+	CMN_EVENT_RNID(CMN_ANY, rrt_occ_ovfl,		0x08),
+	CMN_EVENT_RNID(CMN_ANY, wrt_occ_ovfl,		0x09),
+	CMN_EVENT_RNID(CMN_ANY, txreq_flits_replayed,	0x0a),
+	CMN_EVENT_RNID(CMN_ANY, wrcancel_sent,		0x0b),
+	CMN_EVENT_RNID(CMN_ANY, s0_wdata_beats,		0x0c),
+	CMN_EVENT_RNID(CMN_ANY, s1_wdata_beats,		0x0d),
+	CMN_EVENT_RNID(CMN_ANY, s2_wdata_beats,		0x0e),
+	CMN_EVENT_RNID(CMN_ANY, rrt_alloc,		0x0f),
+	CMN_EVENT_RNID(CMN_ANY, wrt_alloc,		0x10),
+	CMN_EVENT_RNID(CMN600, rdb_unord,		0x11),
+	CMN_EVENT_RNID(CMN600, rdb_replay,		0x12),
+	CMN_EVENT_RNID(CMN600, rdb_hybrid,		0x13),
+	CMN_EVENT_RNID(CMN600, rdb_ord,			0x14),
+	CMN_EVENT_RNID(CI700, padb_occ_ovfl,		0x11),
+	CMN_EVENT_RNID(CI700, rpdb_occ_ovfl,		0x12),
+	CMN_EVENT_RNID(CI700, rrt_occup_ovfl_slice1,	0x13),
+	CMN_EVENT_RNID(CI700, rrt_occup_ovfl_slice2,	0x14),
+	CMN_EVENT_RNID(CI700, rrt_occup_ovfl_slice3,	0x15),
+	CMN_EVENT_RNID(CI700, wrt_throttled,		0x16),
+
+	CMN_EVENT_MTSX(tc_lookup,			0x01),
+	CMN_EVENT_MTSX(tc_fill,				0x02),
+	CMN_EVENT_MTSX(tc_miss,				0x03),
+	CMN_EVENT_MTSX(tdb_forward,			0x04),
+	CMN_EVENT_MTSX(tcq_hazard,			0x05),
+	CMN_EVENT_MTSX(tcq_rd_alloc,			0x06),
+	CMN_EVENT_MTSX(tcq_wr_alloc,			0x07),
+	CMN_EVENT_MTSX(tcq_cmo_alloc,			0x08),
+	CMN_EVENT_MTSX(axi_rd_req,			0x09),
+	CMN_EVENT_MTSX(axi_wr_req,			0x0a),
+	CMN_EVENT_MTSX(tcq_occ_cnt_ovfl,		0x0b),
+	CMN_EVENT_MTSX(tdb_occ_cnt_ovfl,		0x0c),
 
 	NULL
 };
@@ -644,7 +909,8 @@ static u32 arm_cmn_wp_config(struct perf_event *event)
 	config = FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_DEV_SEL, dev) |
 		 FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_CHN_SEL, chn) |
 		 FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_GRP, grp) |
-		 FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_EXCLUSIVE, exc);
+		 FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_EXCLUSIVE, exc) |
+		 FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_DEV_SEL2, dev >> 1);
 	if (combine && !grp)
 		config |= CMN_DTM_WPn_CONFIG_WP_COMBINE;
 
@@ -679,18 +945,19 @@ static void arm_cmn_pmu_disable(struct pmu *pmu)
 static u64 arm_cmn_read_dtm(struct arm_cmn *cmn, struct arm_cmn_hw_event *hw,
 			    bool snapshot)
 {
+	struct arm_cmn_dtm *dtm = NULL;
 	struct arm_cmn_node *dn;
-	unsigned int i, offset;
-	u64 count = 0;
+	unsigned int i, offset, dtm_idx;
+	u64 reg, count = 0;
 
 	offset = snapshot ? CMN_DTM_PMEVCNTSR : CMN_DTM_PMEVCNT;
 	for_each_hw_dn(hw, dn, i) {
-		struct arm_cmn_node *xp = arm_cmn_node_to_xp(dn);
-		int dtm_idx = arm_cmn_get_index(hw->dtm_idx, i);
-		u64 reg = readq_relaxed(xp->pmu_base + offset);
-		u16 dtm_count = reg >> (dtm_idx * 16);
-
-		count += dtm_count;
+		if (dtm != &cmn->dtms[dn->dtm]) {
+			dtm = &cmn->dtms[dn->dtm] + hw->dtm_offset;
+			reg = readq_relaxed(dtm->base + offset);
+		}
+		dtm_idx = arm_cmn_get_index(hw->dtm_idx, i);
+		count += (u16)(reg >> (dtm_idx * 16));
 	}
 	return count;
 }
@@ -774,8 +1041,10 @@ static void arm_cmn_event_start(struct perf_event *event, int flags)
 		u64 mask = CMN_EVENT_WP_MASK(event);
 
 		for_each_hw_dn(hw, dn, i) {
-			writeq_relaxed(val, dn->pmu_base + CMN_DTM_WPn_VAL(wp_idx));
-			writeq_relaxed(mask, dn->pmu_base + CMN_DTM_WPn_MASK(wp_idx));
+			void __iomem *base = dn->pmu_base + CMN_DTM_OFFSET(hw->dtm_offset);
+
+			writeq_relaxed(val, base + CMN_DTM_WPn_VAL(wp_idx));
+			writeq_relaxed(mask, base + CMN_DTM_WPn_MASK(wp_idx));
 		}
 	} else for_each_hw_dn(hw, dn, i) {
 		int dtm_idx = arm_cmn_get_index(hw->dtm_idx, i);
@@ -800,8 +1069,10 @@ static void arm_cmn_event_stop(struct perf_event *event, int flags)
 		int wp_idx = arm_cmn_wp_idx(event);
 
 		for_each_hw_dn(hw, dn, i) {
-			writeq_relaxed(0, dn->pmu_base + CMN_DTM_WPn_MASK(wp_idx));
-			writeq_relaxed(~0ULL, dn->pmu_base + CMN_DTM_WPn_VAL(wp_idx));
+			void __iomem *base = dn->pmu_base + CMN_DTM_OFFSET(hw->dtm_offset);
+
+			writeq_relaxed(0, base + CMN_DTM_WPn_MASK(wp_idx));
+			writeq_relaxed(~0ULL, base + CMN_DTM_WPn_VAL(wp_idx));
 		}
 	} else for_each_hw_dn(hw, dn, i) {
 		int dtm_idx = arm_cmn_get_index(hw->dtm_idx, i);
@@ -814,14 +1085,15 @@ static void arm_cmn_event_stop(struct perf_event *event, int flags)
 }
 
 struct arm_cmn_val {
-	u8 dtm_count[CMN_MAX_XPS];
-	u8 occupid[CMN_MAX_XPS];
-	u8 wp[CMN_MAX_XPS][4];
+	u8 dtm_count[CMN_MAX_DTMS];
+	u8 occupid[CMN_MAX_DTMS];
+	u8 wp[CMN_MAX_DTMS][4];
 	int dtc_count;
 	bool cycles;
 };
 
-static void arm_cmn_val_add_event(struct arm_cmn_val *val, struct perf_event *event)
+static void arm_cmn_val_add_event(struct arm_cmn *cmn, struct arm_cmn_val *val,
+				  struct perf_event *event)
 {
 	struct arm_cmn_hw_event *hw = to_cmn_hw(event);
 	struct arm_cmn_node *dn;
@@ -839,33 +1111,33 @@ static void arm_cmn_val_add_event(struct arm_cmn_val *val, struct perf_event *ev
 	}
 
 	val->dtc_count++;
-	if (arm_cmn_is_occup_event(type, CMN_EVENT_EVENTID(event)))
+	if (arm_cmn_is_occup_event(cmn->model, type, CMN_EVENT_EVENTID(event)))
 		occupid = CMN_EVENT_OCCUPID(event) + 1;
 	else
 		occupid = 0;
 
 	for_each_hw_dn(hw, dn, i) {
-		int wp_idx, xp = arm_cmn_node_to_xp(dn)->logid;
+		int wp_idx, dtm = dn->dtm;
 
-		val->dtm_count[xp]++;
-		val->occupid[xp] = occupid;
+		val->dtm_count[dtm]++;
+		val->occupid[dtm] = occupid;
 
 		if (type != CMN_TYPE_WP)
 			continue;
 
 		wp_idx = arm_cmn_wp_idx(event);
-		val->wp[xp][wp_idx] = CMN_EVENT_WP_COMBINE(event) + 1;
+		val->wp[dtm][wp_idx] = CMN_EVENT_WP_COMBINE(event) + 1;
 	}
 }
 
-static int arm_cmn_validate_group(struct perf_event *event)
+static int arm_cmn_validate_group(struct arm_cmn *cmn, struct perf_event *event)
 {
 	struct arm_cmn_hw_event *hw = to_cmn_hw(event);
 	struct arm_cmn_node *dn;
 	struct perf_event *sibling, *leader = event->group_leader;
 	enum cmn_node_type type;
-	struct arm_cmn_val val;
-	int i;
+	struct arm_cmn_val *val;
+	int i, ret = -EINVAL;
 	u8 occupid;
 
 	if (leader == event)
@@ -874,54 +1146,61 @@ static int arm_cmn_validate_group(struct perf_event *event)
 	if (event->pmu != leader->pmu && !is_software_event(leader))
 		return -EINVAL;
 
-	memset(&val, 0, sizeof(val));
+	val = kzalloc(sizeof(*val), GFP_KERNEL);
+	if (!val)
+		return -ENOMEM;
 
-	arm_cmn_val_add_event(&val, leader);
+	arm_cmn_val_add_event(cmn, val, leader);
 	for_each_sibling_event(sibling, leader)
-		arm_cmn_val_add_event(&val, sibling);
+		arm_cmn_val_add_event(cmn, val, sibling);
 
 	type = CMN_EVENT_TYPE(event);
-	if (type == CMN_TYPE_DTC)
-		return val.cycles ? -EINVAL : 0;
+	if (type == CMN_TYPE_DTC) {
+		ret = val->cycles ? -EINVAL : 0;
+		goto done;
+	}
 
-	if (val.dtc_count == CMN_DT_NUM_COUNTERS)
-		return -EINVAL;
+	if (val->dtc_count == CMN_DT_NUM_COUNTERS)
+		goto done;
 
-	if (arm_cmn_is_occup_event(type, CMN_EVENT_EVENTID(event)))
+	if (arm_cmn_is_occup_event(cmn->model, type, CMN_EVENT_EVENTID(event)))
 		occupid = CMN_EVENT_OCCUPID(event) + 1;
 	else
 		occupid = 0;
 
 	for_each_hw_dn(hw, dn, i) {
-		int wp_idx, wp_cmb, xp = arm_cmn_node_to_xp(dn)->logid;
+		int wp_idx, wp_cmb, dtm = dn->dtm;
 
-		if (val.dtm_count[xp] == CMN_DTM_NUM_COUNTERS)
-			return -EINVAL;
+		if (val->dtm_count[dtm] == CMN_DTM_NUM_COUNTERS)
+			goto done;
 
-		if (occupid && val.occupid[xp] && occupid != val.occupid[xp])
-			return -EINVAL;
+		if (occupid && val->occupid[dtm] && occupid != val->occupid[dtm])
+			goto done;
 
 		if (type != CMN_TYPE_WP)
 			continue;
 
 		wp_idx = arm_cmn_wp_idx(event);
-		if (val.wp[xp][wp_idx])
-			return -EINVAL;
+		if (val->wp[dtm][wp_idx])
+			goto done;
 
-		wp_cmb = val.wp[xp][wp_idx ^ 1];
+		wp_cmb = val->wp[dtm][wp_idx ^ 1];
 		if (wp_cmb && wp_cmb != CMN_EVENT_WP_COMBINE(event) + 1)
-			return -EINVAL;
+			goto done;
 	}
 
-	return 0;
+	ret = 0;
+done:
+	kfree(val);
+	return ret;
 }
 
 static int arm_cmn_event_init(struct perf_event *event)
 {
 	struct arm_cmn *cmn = to_cmn(event->pmu);
 	struct arm_cmn_hw_event *hw = to_cmn_hw(event);
+	struct arm_cmn_node *dn;
 	enum cmn_node_type type;
-	unsigned int i;
 	bool bynodeid;
 	u16 nodeid, eventid;
 
@@ -947,38 +1226,37 @@ static int arm_cmn_event_init(struct perf_event *event)
 		eventid = CMN_EVENT_EVENTID(event);
 		if (eventid != CMN_WP_UP && eventid != CMN_WP_DOWN)
 			return -EINVAL;
+		/* ...but the DTM may depend on which port we're watching */
+		if (cmn->multi_dtm)
+			hw->dtm_offset = CMN_EVENT_WP_DEV_SEL(event) / 2;
 	}
 
 	bynodeid = CMN_EVENT_BYNODEID(event);
 	nodeid = CMN_EVENT_NODEID(event);
 
 	hw->dn = arm_cmn_node(cmn, type);
-	for (i = hw->dn - cmn->dns; i < cmn->num_dns && cmn->dns[i].type == type; i++) {
-		if (!bynodeid) {
-			hw->num_dns++;
-		} else if (cmn->dns[i].id != nodeid) {
+	if (!hw->dn)
+		return -EINVAL;
+	for (dn = hw->dn; dn->type == type; dn++) {
+		if (bynodeid && dn->id != nodeid) {
 			hw->dn++;
-		} else {
-			hw->num_dns = 1;
-			break;
+			continue;
 		}
+		hw->dtcs_used |= arm_cmn_node_to_xp(cmn, dn)->dtc;
+		hw->num_dns++;
+		if (bynodeid)
+			break;
 	}
 
 	if (!hw->num_dns) {
-		int bits = arm_cmn_xyidbits(cmn);
+		struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, nodeid);
 
 		dev_dbg(cmn->dev, "invalid node 0x%x (%d,%d,%d,%d) type 0x%x\n",
-			nodeid, CMN_NODEID_X(nodeid, bits), CMN_NODEID_Y(nodeid, bits),
-			CMN_NODEID_PID(nodeid), CMN_NODEID_DEVID(nodeid), type);
+			nodeid, nid.x, nid.y, nid.port, nid.dev, type);
 		return -EINVAL;
 	}
-	/*
-	 * By assuming events count in all DTC domains, we cunningly avoid
-	 * needing to know anything about how XPs are assigned to domains.
-	 */
-	hw->dtcs_used = (1U << cmn->num_dtcs) - 1;
 
-	return arm_cmn_validate_group(event);
+	return arm_cmn_validate_group(cmn, event);
 }
 
 static void arm_cmn_event_clear(struct arm_cmn *cmn, struct perf_event *event,
@@ -988,17 +1266,17 @@ static void arm_cmn_event_clear(struct arm_cmn *cmn, struct perf_event *event,
 	enum cmn_node_type type = CMN_EVENT_TYPE(event);
 
 	while (i--) {
-		struct arm_cmn_node *xp = arm_cmn_node_to_xp(hw->dn + i);
+		struct arm_cmn_dtm *dtm = &cmn->dtms[hw->dn[i].dtm] + hw->dtm_offset;
 		unsigned int dtm_idx = arm_cmn_get_index(hw->dtm_idx, i);
 
 		if (type == CMN_TYPE_WP)
-			hw->dn[i].wp_event[arm_cmn_wp_idx(event)] = -1;
+			dtm->wp_event[arm_cmn_wp_idx(event)] = -1;
 
-		if (arm_cmn_is_occup_event(type, CMN_EVENT_EVENTID(event)))
+		if (arm_cmn_is_occup_event(cmn->model, type, CMN_EVENT_EVENTID(event)))
 			hw->dn[i].occupid_count--;
 
-		xp->pmu_config_low &= ~CMN__PMEVCNT_PAIRED(dtm_idx);
-		writel_relaxed(xp->pmu_config_low, xp->pmu_base + CMN_DTM_PMU_CONFIG);
+		dtm->pmu_config_low &= ~CMN__PMEVCNT_PAIRED(dtm_idx);
+		writel_relaxed(dtm->pmu_config_low, dtm->base + CMN_DTM_PMU_CONFIG);
 	}
 	memset(hw->dtm_idx, 0, sizeof(hw->dtm_idx));
 
@@ -1040,12 +1318,12 @@ static int arm_cmn_event_add(struct perf_event *event, int flags)
 
 	/* ...then the local counters to feed it. */
 	for_each_hw_dn(hw, dn, i) {
-		struct arm_cmn_node *xp = arm_cmn_node_to_xp(dn);
+		struct arm_cmn_dtm *dtm = &cmn->dtms[dn->dtm] + hw->dtm_offset;
 		unsigned int dtm_idx, shift;
 		u64 reg;
 
 		dtm_idx = 0;
-		while (xp->pmu_config_low & CMN__PMEVCNT_PAIRED(dtm_idx))
+		while (dtm->pmu_config_low & CMN__PMEVCNT_PAIRED(dtm_idx))
 			if (++dtm_idx == CMN_DTM_NUM_COUNTERS)
 				goto free_dtms;
 
@@ -1055,26 +1333,28 @@ static int arm_cmn_event_add(struct perf_event *event, int flags)
 			int tmp, wp_idx = arm_cmn_wp_idx(event);
 			u32 cfg = arm_cmn_wp_config(event);
 
-			if (dn->wp_event[wp_idx] >= 0)
+			if (dtm->wp_event[wp_idx] >= 0)
 				goto free_dtms;
 
-			tmp = dn->wp_event[wp_idx ^ 1];
+			tmp = dtm->wp_event[wp_idx ^ 1];
 			if (tmp >= 0 && CMN_EVENT_WP_COMBINE(event) !=
 					CMN_EVENT_WP_COMBINE(dtc->counters[tmp]))
 				goto free_dtms;
 
 			input_sel = CMN__PMEVCNT0_INPUT_SEL_WP + wp_idx;
-			dn->wp_event[wp_idx] = dtc_idx;
-			writel_relaxed(cfg, dn->pmu_base + CMN_DTM_WPn_CONFIG(wp_idx));
+			dtm->wp_event[wp_idx] = dtc_idx;
+			writel_relaxed(cfg, dtm->base + CMN_DTM_WPn_CONFIG(wp_idx));
 		} else {
-			unsigned int port = CMN_NODEID_PID(dn->id);
-			unsigned int dev = CMN_NODEID_DEVID(dn->id);
+			struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, dn->id);
+
+			if (cmn->multi_dtm)
+				nid.port %= 2;
 
 			input_sel = CMN__PMEVCNT0_INPUT_SEL_DEV + dtm_idx +
-				    (port << 4) + (dev << 2);
+				    (nid.port << 4) + (nid.dev << 2);
 
-			if (arm_cmn_is_occup_event(type, CMN_EVENT_EVENTID(event))) {
-				int occupid = CMN_EVENT_OCCUPID(event);
+			if (arm_cmn_is_occup_event(cmn->model, type, CMN_EVENT_EVENTID(event))) {
+				u8 occupid = CMN_EVENT_OCCUPID(event);
 
 				if (dn->occupid_count == 0) {
 					dn->occupid_val = occupid;
@@ -1089,13 +1369,13 @@ static int arm_cmn_event_add(struct perf_event *event, int flags)
 
 		arm_cmn_set_index(hw->dtm_idx, i, dtm_idx);
 
-		xp->input_sel[dtm_idx] = input_sel;
+		dtm->input_sel[dtm_idx] = input_sel;
 		shift = CMN__PMEVCNTn_GLOBAL_NUM_SHIFT(dtm_idx);
-		xp->pmu_config_low &= ~(CMN__PMEVCNT0_GLOBAL_NUM << shift);
-		xp->pmu_config_low |= FIELD_PREP(CMN__PMEVCNT0_GLOBAL_NUM, dtc_idx) << shift;
-		xp->pmu_config_low |= CMN__PMEVCNT_PAIRED(dtm_idx);
-		reg = (u64)le32_to_cpu(xp->pmu_config_high) << 32 | xp->pmu_config_low;
-		writeq_relaxed(reg, xp->pmu_base + CMN_DTM_PMU_CONFIG);
+		dtm->pmu_config_low &= ~(CMN__PMEVCNT0_GLOBAL_NUM << shift);
+		dtm->pmu_config_low |= FIELD_PREP(CMN__PMEVCNT0_GLOBAL_NUM, dtc_idx) << shift;
+		dtm->pmu_config_low |= CMN__PMEVCNT_PAIRED(dtm_idx);
+		reg = (u64)le32_to_cpu(dtm->pmu_config_high) << 32 | dtm->pmu_config_low;
+		writeq_relaxed(reg, dtm->base + CMN_DTM_PMU_CONFIG);
 	}
 
 	/* Go go go! */
@@ -1147,23 +1427,47 @@ static int arm_cmn_commit_txn(struct pmu *pmu)
 	return 0;
 }
 
-static int arm_cmn_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
+static void arm_cmn_migrate(struct arm_cmn *cmn, unsigned int cpu)
+{
+	unsigned int i;
+
+	perf_pmu_migrate_context(&cmn->pmu, cmn->cpu, cpu);
+	for (i = 0; i < cmn->num_dtcs; i++)
+		irq_set_affinity(cmn->dtc[i].irq, cpumask_of(cpu));
+	cmn->cpu = cpu;
+}
+
+static int arm_cmn_pmu_online_cpu(unsigned int cpu, struct hlist_node *cpuhp_node)
 {
 	struct arm_cmn *cmn;
-	unsigned int i, target;
+	int node;
 
-	cmn = hlist_entry_safe(node, struct arm_cmn, cpuhp_node);
-	if (cpu != cmn->cpu)
-		return 0;
+	cmn = hlist_entry_safe(cpuhp_node, struct arm_cmn, cpuhp_node);
+	node = dev_to_node(cmn->dev);
+	if (node != NUMA_NO_NODE && cpu_to_node(cmn->cpu) != node && cpu_to_node(cpu) == node)
+		arm_cmn_migrate(cmn, cpu);
+	return 0;
+}
+
+static int arm_cmn_pmu_offline_cpu(unsigned int cpu, struct hlist_node *cpuhp_node)
+{
+	struct arm_cmn *cmn;
+	unsigned int target;
+	int node;
+	cpumask_t mask;
 
-	target = cpumask_any_but(cpu_online_mask, cpu);
-	if (target >= nr_cpu_ids)
+	cmn = hlist_entry_safe(cpuhp_node, struct arm_cmn, cpuhp_node);
+	if (cpu != cmn->cpu)
 		return 0;
 
-	perf_pmu_migrate_context(&cmn->pmu, cpu, target);
-	for (i = 0; i < cmn->num_dtcs; i++)
-		irq_set_affinity(cmn->dtc[i].irq, cpumask_of(target));
-	cmn->cpu = target;
+	node = dev_to_node(cmn->dev);
+	if (cpumask_and(&mask, cpumask_of_node(node), cpu_online_mask) &&
+	    cpumask_andnot(&mask, &mask, cpumask_of(cpu)))
+		target = cpumask_any(&mask);
+	else
+		target = cpumask_any_but(cpu_online_mask, cpu);
+	if (target < nr_cpu_ids)
+		arm_cmn_migrate(cmn, target);
 	return 0;
 }
 
@@ -1231,23 +1535,22 @@ static int arm_cmn_init_irqs(struct arm_cmn *cmn)
 	return 0;
 }
 
-static void arm_cmn_init_dtm(struct arm_cmn_node *xp)
+static void arm_cmn_init_dtm(struct arm_cmn_dtm *dtm, struct arm_cmn_node *xp, int idx)
 {
 	int i;
 
+	dtm->base = xp->pmu_base + CMN_DTM_OFFSET(idx);
+	dtm->pmu_config_low = CMN_DTM_PMU_CONFIG_PMU_EN;
 	for (i = 0; i < 4; i++) {
-		xp->wp_event[i] = -1;
-		writeq_relaxed(0, xp->pmu_base + CMN_DTM_WPn_MASK(i));
-		writeq_relaxed(~0ULL, xp->pmu_base + CMN_DTM_WPn_VAL(i));
+		dtm->wp_event[i] = -1;
+		writeq_relaxed(0, dtm->base + CMN_DTM_WPn_MASK(i));
+		writeq_relaxed(~0ULL, dtm->base + CMN_DTM_WPn_VAL(i));
 	}
-	xp->pmu_config_low = CMN_DTM_PMU_CONFIG_PMU_EN;
-	xp->dtc = -1;
 }
 
 static int arm_cmn_init_dtc(struct arm_cmn *cmn, struct arm_cmn_node *dn, int idx)
 {
 	struct arm_cmn_dtc *dtc = cmn->dtc + idx;
-	struct arm_cmn_node *xp;
 
 	dtc->base = dn->pmu_base - CMN_PMU_OFFSET;
 	dtc->irq = platform_get_irq(to_platform_device(cmn->dev), idx);
@@ -1258,10 +1561,6 @@ static int arm_cmn_init_dtc(struct arm_cmn *cmn, struct arm_cmn_node *dn, int id
 	writel_relaxed(0x1ff, dtc->base + CMN_DT_PMOVSR_CLR);
 	writel_relaxed(CMN_DT_PMCR_OVFL_INTR_EN, dtc->base + CMN_DT_PMCR);
 
-	/* We do at least know that a DTC's XP must be in that DTC's domain */
-	xp = arm_cmn_node_to_xp(dn);
-	xp->dtc = idx;
-
 	return 0;
 }
 
@@ -1278,8 +1577,9 @@ static int arm_cmn_node_cmp(const void *a, const void *b)
 
 static int arm_cmn_init_dtcs(struct arm_cmn *cmn)
 {
-	struct arm_cmn_node *dn;
+	struct arm_cmn_node *dn, *xp;
 	int dtc_idx = 0;
+	u8 dtcs_present = (1 << cmn->num_dtcs) - 1;
 
 	cmn->dtc = devm_kcalloc(cmn->dev, cmn->num_dtcs, sizeof(cmn->dtc[0]), GFP_KERNEL);
 	if (!cmn->dtc)
@@ -1289,14 +1589,26 @@ static int arm_cmn_init_dtcs(struct arm_cmn *cmn)
 
 	cmn->xps = arm_cmn_node(cmn, CMN_TYPE_XP);
 
-	for (dn = cmn->dns; dn < cmn->dns + cmn->num_dns; dn++) {
-		if (dn->type != CMN_TYPE_XP)
-			arm_cmn_init_node_to_xp(cmn, dn);
-		else if (cmn->num_dtcs == 1)
-			dn->dtc = 0;
+	for (dn = cmn->dns; dn->type; dn++) {
+		if (dn->type == CMN_TYPE_XP) {
+			dn->dtc &= dtcs_present;
+			continue;
+		}
 
-		if (dn->type == CMN_TYPE_DTC)
-			arm_cmn_init_dtc(cmn, dn, dtc_idx++);
+		xp = arm_cmn_node_to_xp(cmn, dn);
+		dn->dtm = xp->dtm;
+		if (cmn->multi_dtm)
+			dn->dtm += arm_cmn_nid(cmn, dn->id).port / 2;
+
+		if (dn->type == CMN_TYPE_DTC) {
+			int err;
+			/* We do at least know that a DTC's XP must be in that DTC's domain */
+			if (xp->dtc == 0xf)
+				xp->dtc = 1 << dtc_idx;
+			err = arm_cmn_init_dtc(cmn, dn, dtc_idx++);
+			if (err)
+				return err;
+		}
 
 		/* To the PMU, RN-Ds don't add anything over RN-Is, so smoosh them together */
 		if (dn->type == CMN_TYPE_RND)
@@ -1335,19 +1647,25 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset)
 {
 	void __iomem *cfg_region;
 	struct arm_cmn_node cfg, *dn;
+	struct arm_cmn_dtm *dtm;
 	u16 child_count, child_poff;
 	u32 xp_offset[CMN_MAX_XPS];
 	u64 reg;
 	int i, j;
+	size_t sz;
+
+	arm_cmn_init_node_info(cmn, rgn_offset, &cfg);
+	if (cfg.type != CMN_TYPE_CFG)
+		return -ENODEV;
 
 	cfg_region = cmn->base + rgn_offset;
 	reg = readl_relaxed(cfg_region + CMN_CFGM_PERIPH_ID_2);
 	cmn->rev = FIELD_GET(CMN_CFGM_PID2_REVISION, reg);
-	dev_dbg(cmn->dev, "periph_id_2 revision: %d\n", cmn->rev);
 
-	arm_cmn_init_node_info(cmn, rgn_offset, &cfg);
-	if (cfg.type != CMN_TYPE_CFG)
-		return -ENODEV;
+	reg = readq_relaxed(cfg_region + CMN_CFGM_INFO_GLOBAL);
+	cmn->multi_dtm = reg & CMN_INFO_MULTIPLE_DTM_EN;
+	cmn->rsp_vc_num = FIELD_GET(CMN_INFO_RSP_VC_NUM, reg);
+	cmn->dat_vc_num = FIELD_GET(CMN_INFO_DAT_VC_NUM, reg);
 
 	reg = readq_relaxed(cfg_region + CMN_CHILD_INFO);
 	child_count = FIELD_GET(CMN_CI_CHILD_COUNT, reg);
@@ -1365,20 +1683,28 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset)
 		cmn->num_dns += FIELD_GET(CMN_CI_CHILD_COUNT, reg);
 	}
 
-	/* Cheeky +1 to help terminate pointer-based iteration */
-	cmn->dns = devm_kcalloc(cmn->dev, cmn->num_dns + 1,
-				sizeof(*cmn->dns), GFP_KERNEL);
-	if (!cmn->dns)
+	/* Cheeky +1 to help terminate pointer-based iteration later */
+	dn = devm_kcalloc(cmn->dev, cmn->num_dns + 1, sizeof(*dn), GFP_KERNEL);
+	if (!dn)
+		return -ENOMEM;
+
+	/* Initial safe upper bound on DTMs for any possible mesh layout */
+	i = cmn->num_xps;
+	if (cmn->multi_dtm)
+		i += cmn->num_xps + 1;
+	dtm = devm_kcalloc(cmn->dev, i, sizeof(*dtm), GFP_KERNEL);
+	if (!dtm)
 		return -ENOMEM;
 
 	/* Pass 2: now we can actually populate the nodes */
-	dn = cmn->dns;
+	cmn->dns = dn;
+	cmn->dtms = dtm;
 	for (i = 0; i < cmn->num_xps; i++) {
 		void __iomem *xp_region = cmn->base + xp_offset[i];
 		struct arm_cmn_node *xp = dn++;
+		unsigned int xp_ports = 0;
 
 		arm_cmn_init_node_info(cmn, xp_offset[i], xp);
-		arm_cmn_init_dtm(xp);
 		/*
 		 * Thanks to the order in which XP logical IDs seem to be
 		 * assigned, we can handily infer the mesh X dimension by
@@ -1388,6 +1714,40 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset)
 		if (xp->id == (1 << 3))
 			cmn->mesh_x = xp->logid;
 
+		if (cmn->model == CMN600)
+			xp->dtc = 0xf;
+		else
+			xp->dtc = 1 << readl_relaxed(xp_region + CMN_DTM_UNIT_INFO);
+
+		xp->dtm = dtm - cmn->dtms;
+		arm_cmn_init_dtm(dtm++, xp, 0);
+		/*
+		 * Keeping track of connected ports will let us filter out
+		 * unnecessary XP events easily. We can also reliably infer the
+		 * "extra device ports" configuration for the node ID format
+		 * from this, since in that case we will see at least one XP
+		 * with port 2 connected, for the HN-D.
+		 */
+		if (readq_relaxed(xp_region + CMN_MXP__CONNECT_INFO_P0))
+			xp_ports |= BIT(0);
+		if (readq_relaxed(xp_region + CMN_MXP__CONNECT_INFO_P1))
+			xp_ports |= BIT(1);
+		if (readq_relaxed(xp_region + CMN_MXP__CONNECT_INFO_P2))
+			xp_ports |= BIT(2);
+		if (readq_relaxed(xp_region + CMN_MXP__CONNECT_INFO_P3))
+			xp_ports |= BIT(3);
+		if (readq_relaxed(xp_region + CMN_MXP__CONNECT_INFO_P4))
+			xp_ports |= BIT(4);
+		if (readq_relaxed(xp_region + CMN_MXP__CONNECT_INFO_P5))
+			xp_ports |= BIT(5);
+
+		if (cmn->multi_dtm && (xp_ports & 0xc))
+			arm_cmn_init_dtm(dtm++, xp, 1);
+		if (cmn->multi_dtm && (xp_ports & 0x30))
+			arm_cmn_init_dtm(dtm++, xp, 2);
+
+		cmn->ports_used |= xp_ports;
+
 		reg = readq_relaxed(xp_region + CMN_CHILD_INFO);
 		child_count = FIELD_GET(CMN_CI_CHILD_COUNT, reg);
 		child_poff = FIELD_GET(CMN_CI_CHILD_PTR_OFFSET, reg);
@@ -1422,11 +1782,14 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset)
 			case CMN_TYPE_SBSX:
 			case CMN_TYPE_RNI:
 			case CMN_TYPE_RND:
+			case CMN_TYPE_MTSX:
 			case CMN_TYPE_CXRA:
 			case CMN_TYPE_CXHA:
 				dn++;
 				break;
 			/* Nothing to see here */
+			case CMN_TYPE_MPAM_S:
+			case CMN_TYPE_MPAM_NS:
 			case CMN_TYPE_RNSAM:
 			case CMN_TYPE_CXLA:
 				break;
@@ -1441,6 +1804,16 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset)
 	/* Correct for any nodes we skipped */
 	cmn->num_dns = dn - cmn->dns;
 
+	sz = (void *)(dn + 1) - (void *)cmn->dns;
+	dn = devm_krealloc(cmn->dev, cmn->dns, sz, GFP_KERNEL);
+	if (dn)
+		cmn->dns = dn;
+
+	sz = (void *)dtm - (void *)cmn->dtms;
+	dtm = devm_krealloc(cmn->dev, cmn->dtms, sz, GFP_KERNEL);
+	if (dtm)
+		cmn->dtms = dtm;
+
 	/*
 	 * If mesh_x wasn't set during discovery then we never saw
 	 * an XP at (0,1), thus we must have an Nx1 configuration.
@@ -1449,13 +1822,20 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset)
 		cmn->mesh_x = cmn->num_xps;
 	cmn->mesh_y = cmn->num_xps / cmn->mesh_x;
 
-	dev_dbg(cmn->dev, "mesh %dx%d, ID width %d\n",
-		cmn->mesh_x, cmn->mesh_y, arm_cmn_xyidbits(cmn));
+	/* 1x1 config plays havoc with XP event encodings */
+	if (cmn->num_xps == 1)
+		dev_warn(cmn->dev, "1x1 config not fully supported, translate XP events manually\n");
+
+	dev_dbg(cmn->dev, "model %d, periph_id_2 revision %d\n", cmn->model, cmn->rev);
+	reg = cmn->ports_used;
+	dev_dbg(cmn->dev, "mesh %dx%d, ID width %d, ports %6pbl%s\n",
+		cmn->mesh_x, cmn->mesh_y, arm_cmn_xyidbits(cmn), &reg,
+		cmn->multi_dtm ? ", multi-DTM" : "");
 
 	return 0;
 }
 
-static int arm_cmn_acpi_probe(struct platform_device *pdev, struct arm_cmn *cmn)
+static int arm_cmn600_acpi_probe(struct platform_device *pdev, struct arm_cmn *cmn)
 {
 	struct resource *cfg, *root;
 
@@ -1482,21 +1862,11 @@ static int arm_cmn_acpi_probe(struct platform_device *pdev, struct arm_cmn *cmn)
 	return root->start - cfg->start;
 }
 
-static int arm_cmn_of_probe(struct platform_device *pdev, struct arm_cmn *cmn)
+static int arm_cmn600_of_probe(struct device_node *np)
 {
-	struct device_node *np = pdev->dev.of_node;
 	u32 rootnode;
-	int ret;
-
-	cmn->base = devm_platform_ioremap_resource(pdev, 0);
-	if (IS_ERR(cmn->base))
-		return PTR_ERR(cmn->base);
 
-	ret = of_property_read_u32(np, "arm,root-node", &rootnode);
-	if (ret)
-		return ret;
-
-	return rootnode;
+	return of_property_read_u32(np, "arm,root-node", &rootnode) ?: rootnode;
 }
 
 static int arm_cmn_probe(struct platform_device *pdev)
@@ -1504,19 +1874,26 @@ static int arm_cmn_probe(struct platform_device *pdev)
 	struct arm_cmn *cmn;
 	const char *name;
 	static atomic_t id;
-	int err, rootnode;
+	int err, rootnode, this_id;
 
 	cmn = devm_kzalloc(&pdev->dev, sizeof(*cmn), GFP_KERNEL);
 	if (!cmn)
 		return -ENOMEM;
 
 	cmn->dev = &pdev->dev;
+	cmn->model = (unsigned long)device_get_match_data(cmn->dev);
 	platform_set_drvdata(pdev, cmn);
 
-	if (has_acpi_companion(cmn->dev))
-		rootnode = arm_cmn_acpi_probe(pdev, cmn);
-	else
-		rootnode = arm_cmn_of_probe(pdev, cmn);
+	if (cmn->model == CMN600 && has_acpi_companion(cmn->dev)) {
+		rootnode = arm_cmn600_acpi_probe(pdev, cmn);
+	} else {
+		rootnode = 0;
+		cmn->base = devm_platform_ioremap_resource(pdev, 0);
+		if (IS_ERR(cmn->base))
+			return PTR_ERR(cmn->base);
+		if (cmn->model == CMN600)
+			rootnode = arm_cmn600_of_probe(pdev->dev.of_node);
+	}
 	if (rootnode < 0)
 		return rootnode;
 
@@ -1532,7 +1909,7 @@ static int arm_cmn_probe(struct platform_device *pdev)
 	if (err)
 		return err;
 
-	cmn->cpu = raw_smp_processor_id();
+	cmn->cpu = cpumask_local_spread(0, dev_to_node(cmn->dev));
 	cmn->pmu = (struct pmu) {
 		.module = THIS_MODULE,
 		.attr_groups = arm_cmn_attr_groups,
@@ -1551,7 +1928,8 @@ static int arm_cmn_probe(struct platform_device *pdev)
 		.cancel_txn = arm_cmn_end_txn,
 	};
 
-	name = devm_kasprintf(cmn->dev, GFP_KERNEL, "arm_cmn_%d", atomic_fetch_inc(&id));
+	this_id = atomic_fetch_inc(&id);
+	name = devm_kasprintf(cmn->dev, GFP_KERNEL, "arm_cmn_%d", this_id);
 	if (!name)
 		return -ENOMEM;
 
@@ -1561,7 +1939,10 @@ static int arm_cmn_probe(struct platform_device *pdev)
 
 	err = perf_pmu_register(&cmn->pmu, name, -1);
 	if (err)
-		cpuhp_state_remove_instance(arm_cmn_hp_state, &cmn->cpuhp_node);
+		cpuhp_state_remove_instance_nocalls(arm_cmn_hp_state, &cmn->cpuhp_node);
+	else
+		arm_cmn_debugfs_init(cmn, this_id);
+
 	return err;
 }
 
@@ -1572,13 +1953,15 @@ static int arm_cmn_remove(struct platform_device *pdev)
 	writel_relaxed(0, cmn->dtc[0].base + CMN_DT_DTC_CTL);
 
 	perf_pmu_unregister(&cmn->pmu);
-	cpuhp_state_remove_instance(arm_cmn_hp_state, &cmn->cpuhp_node);
+	cpuhp_state_remove_instance_nocalls(arm_cmn_hp_state, &cmn->cpuhp_node);
+	debugfs_remove(cmn->debug);
 	return 0;
 }
 
 #ifdef CONFIG_OF
 static const struct of_device_id arm_cmn_of_match[] = {
-	{ .compatible = "arm,cmn-600", },
+	{ .compatible = "arm,cmn-600", .data = (void *)CMN600 },
+	{ .compatible = "arm,ci-700", .data = (void *)CI700 },
 	{}
 };
 MODULE_DEVICE_TABLE(of, arm_cmn_of_match);
@@ -1586,7 +1969,7 @@ MODULE_DEVICE_TABLE(of, arm_cmn_of_match);
 
 #ifdef CONFIG_ACPI
 static const struct acpi_device_id arm_cmn_acpi_match[] = {
-	{ "ARMHC600", },
+	{ "ARMHC600", CMN600 },
 	{}
 };
 MODULE_DEVICE_TABLE(acpi, arm_cmn_acpi_match);
@@ -1607,15 +1990,20 @@ static int __init arm_cmn_init(void)
 	int ret;
 
 	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
-				      "perf/arm/cmn:online", NULL,
+				      "perf/arm/cmn:online",
+				      arm_cmn_pmu_online_cpu,
 				      arm_cmn_pmu_offline_cpu);
 	if (ret < 0)
 		return ret;
 
 	arm_cmn_hp_state = ret;
+	arm_cmn_debugfs = debugfs_create_dir("arm-cmn", NULL);
+
 	ret = platform_driver_register(&arm_cmn_driver);
-	if (ret)
+	if (ret) {
 		cpuhp_remove_multi_state(arm_cmn_hp_state);
+		debugfs_remove(arm_cmn_debugfs);
+	}
 	return ret;
 }
 
@@ -1623,6 +2011,7 @@ static void __exit arm_cmn_exit(void)
 {
 	platform_driver_unregister(&arm_cmn_driver);
 	cpuhp_remove_multi_state(arm_cmn_hp_state);
+	debugfs_remove(arm_cmn_debugfs);
 }
 
 module_init(arm_cmn_init);
diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c
index 226348822ab3..1ae19f7301b2 100644
--- a/drivers/perf/arm_smmuv3_pmu.c
+++ b/drivers/perf/arm_smmuv3_pmu.c
@@ -47,6 +47,7 @@
 #include <linux/kernel.h>
 #include <linux/list.h>
 #include <linux/msi.h>
+#include <linux/of.h>
 #include <linux/perf_event.h>
 #include <linux/platform_device.h>
 #include <linux/smp.h>
@@ -75,6 +76,10 @@
 #define SMMU_PMCG_CR                    0xE04
 #define SMMU_PMCG_CR_ENABLE             BIT(0)
 #define SMMU_PMCG_IIDR                  0xE08
+#define SMMU_PMCG_IIDR_PRODUCTID        GENMASK(31, 20)
+#define SMMU_PMCG_IIDR_VARIANT          GENMASK(19, 16)
+#define SMMU_PMCG_IIDR_REVISION         GENMASK(15, 12)
+#define SMMU_PMCG_IIDR_IMPLEMENTER      GENMASK(11, 0)
 #define SMMU_PMCG_CEID0                 0xE20
 #define SMMU_PMCG_CEID1                 0xE28
 #define SMMU_PMCG_IRQ_CTRL              0xE50
@@ -83,6 +88,20 @@
 #define SMMU_PMCG_IRQ_CFG1              0xE60
 #define SMMU_PMCG_IRQ_CFG2              0xE64
 
+/* IMP-DEF ID registers */
+#define SMMU_PMCG_PIDR0                 0xFE0
+#define SMMU_PMCG_PIDR0_PART_0          GENMASK(7, 0)
+#define SMMU_PMCG_PIDR1                 0xFE4
+#define SMMU_PMCG_PIDR1_DES_0           GENMASK(7, 4)
+#define SMMU_PMCG_PIDR1_PART_1          GENMASK(3, 0)
+#define SMMU_PMCG_PIDR2                 0xFE8
+#define SMMU_PMCG_PIDR2_REVISION        GENMASK(7, 4)
+#define SMMU_PMCG_PIDR2_DES_1           GENMASK(2, 0)
+#define SMMU_PMCG_PIDR3                 0xFEC
+#define SMMU_PMCG_PIDR3_REVAND          GENMASK(7, 4)
+#define SMMU_PMCG_PIDR4                 0xFD0
+#define SMMU_PMCG_PIDR4_DES_2           GENMASK(3, 0)
+
 /* MSI config fields */
 #define MSI_CFG0_ADDR_MASK              GENMASK_ULL(51, 2)
 #define MSI_CFG2_MEMATTR_DEVICE_nGnRE   0x1
@@ -754,6 +773,41 @@ static void smmu_pmu_get_acpi_options(struct smmu_pmu *smmu_pmu)
 	dev_notice(smmu_pmu->dev, "option mask 0x%x\n", smmu_pmu->options);
 }
 
+static bool smmu_pmu_coresight_id_regs(struct smmu_pmu *smmu_pmu)
+{
+	return of_device_is_compatible(smmu_pmu->dev->of_node,
+				       "arm,mmu-600-pmcg");
+}
+
+static void smmu_pmu_get_iidr(struct smmu_pmu *smmu_pmu)
+{
+	u32 iidr = readl_relaxed(smmu_pmu->reg_base + SMMU_PMCG_IIDR);
+
+	if (!iidr && smmu_pmu_coresight_id_regs(smmu_pmu)) {
+		u32 pidr0 = readl(smmu_pmu->reg_base + SMMU_PMCG_PIDR0);
+		u32 pidr1 = readl(smmu_pmu->reg_base + SMMU_PMCG_PIDR1);
+		u32 pidr2 = readl(smmu_pmu->reg_base + SMMU_PMCG_PIDR2);
+		u32 pidr3 = readl(smmu_pmu->reg_base + SMMU_PMCG_PIDR3);
+		u32 pidr4 = readl(smmu_pmu->reg_base + SMMU_PMCG_PIDR4);
+
+		u32 productid = FIELD_GET(SMMU_PMCG_PIDR0_PART_0, pidr0) |
+				(FIELD_GET(SMMU_PMCG_PIDR1_PART_1, pidr1) << 8);
+		u32 variant = FIELD_GET(SMMU_PMCG_PIDR2_REVISION, pidr2);
+		u32 revision = FIELD_GET(SMMU_PMCG_PIDR3_REVAND, pidr3);
+		u32 implementer =
+			FIELD_GET(SMMU_PMCG_PIDR1_DES_0, pidr1) |
+			(FIELD_GET(SMMU_PMCG_PIDR2_DES_1, pidr2) << 4) |
+			(FIELD_GET(SMMU_PMCG_PIDR4_DES_2, pidr4) << 8);
+
+		iidr = FIELD_PREP(SMMU_PMCG_IIDR_PRODUCTID, productid) |
+		       FIELD_PREP(SMMU_PMCG_IIDR_VARIANT, variant) |
+		       FIELD_PREP(SMMU_PMCG_IIDR_REVISION, revision) |
+		       FIELD_PREP(SMMU_PMCG_IIDR_IMPLEMENTER, implementer);
+	}
+
+	smmu_pmu->iidr = iidr;
+}
+
 static int smmu_pmu_probe(struct platform_device *pdev)
 {
 	struct smmu_pmu *smmu_pmu;
@@ -825,7 +879,7 @@ static int smmu_pmu_probe(struct platform_device *pdev)
 		return err;
 	}
 
-	smmu_pmu->iidr = readl_relaxed(smmu_pmu->reg_base + SMMU_PMCG_IIDR);
+	smmu_pmu_get_iidr(smmu_pmu);
 
 	name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "smmuv3_pmcg_%llx",
 			      (res_0->start) >> SMMU_PMCG_PA_SHIFT);
@@ -834,7 +888,8 @@ static int smmu_pmu_probe(struct platform_device *pdev)
 		return -EINVAL;
 	}
 
-	smmu_pmu_get_acpi_options(smmu_pmu);
+	if (!dev->of_node)
+		smmu_pmu_get_acpi_options(smmu_pmu);
 
 	/* Pick one CPU to be the preferred one to use */
 	smmu_pmu->on_cpu = raw_smp_processor_id();
@@ -884,9 +939,18 @@ static void smmu_pmu_shutdown(struct platform_device *pdev)
 	smmu_pmu_disable(&smmu_pmu->pmu);
 }
 
+#ifdef CONFIG_OF
+static const struct of_device_id smmu_pmu_of_match[] = {
+	{ .compatible = "arm,smmu-v3-pmcg" },
+	{}
+};
+MODULE_DEVICE_TABLE(of, smmu_pmu_of_match);
+#endif
+
 static struct platform_driver smmu_pmu_driver = {
 	.driver = {
 		.name = "arm-smmu-v3-pmcg",
+		.of_match_table = of_match_ptr(smmu_pmu_of_match),
 		.suppress_bind_attrs = true,
 	},
 	.probe = smmu_pmu_probe,
diff --git a/drivers/perf/hisilicon/Kconfig b/drivers/perf/hisilicon/Kconfig
index c5d1b7019fff..5546218b5598 100644
--- a/drivers/perf/hisilicon/Kconfig
+++ b/drivers/perf/hisilicon/Kconfig
@@ -5,3 +5,12 @@ config HISI_PMU
 	  help
 	  Support for HiSilicon SoC L3 Cache performance monitor, Hydra Home
 	  Agent performance monitor and DDR Controller performance monitor.
+
+config HISI_PCIE_PMU
+	tristate "HiSilicon PCIE PERF PMU"
+	depends on PCI && ARM64
+	help
+	  Provide support for HiSilicon PCIe performance monitoring unit (PMU)
+	  RCiEP devices.
+	  Adds the PCIe PMU into perf events system for monitoring latency,
+	  bandwidth etc.
diff --git a/drivers/perf/hisilicon/Makefile b/drivers/perf/hisilicon/Makefile
index 7643c9f93e36..506ed39e3266 100644
--- a/drivers/perf/hisilicon/Makefile
+++ b/drivers/perf/hisilicon/Makefile
@@ -2,3 +2,5 @@
 obj-$(CONFIG_HISI_PMU) += hisi_uncore_pmu.o hisi_uncore_l3c_pmu.o \
 			  hisi_uncore_hha_pmu.o hisi_uncore_ddrc_pmu.o hisi_uncore_sllc_pmu.o \
 			  hisi_uncore_pa_pmu.o
+
+obj-$(CONFIG_HISI_PCIE_PMU) += hisi_pcie_pmu.o
diff --git a/drivers/perf/hisilicon/hisi_pcie_pmu.c b/drivers/perf/hisilicon/hisi_pcie_pmu.c
new file mode 100644
index 000000000000..21771708597d
--- /dev/null
+++ b/drivers/perf/hisilicon/hisi_pcie_pmu.c
@@ -0,0 +1,948 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * This driver adds support for PCIe PMU RCiEP device. Related
+ * perf events are bandwidth, latency etc.
+ *
+ * Copyright (C) 2021 HiSilicon Limited
+ * Author: Qi Liu <liuqi115@huawei.com>
+ */
+#include <linux/bitfield.h>
+#include <linux/bitmap.h>
+#include <linux/bug.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/perf_event.h>
+
+#define DRV_NAME "hisi_pcie_pmu"
+/* Define registers */
+#define HISI_PCIE_GLOBAL_CTRL		0x00
+#define HISI_PCIE_EVENT_CTRL		0x010
+#define HISI_PCIE_CNT			0x090
+#define HISI_PCIE_EXT_CNT		0x110
+#define HISI_PCIE_INT_STAT		0x150
+#define HISI_PCIE_INT_MASK		0x154
+#define HISI_PCIE_REG_BDF		0xfe0
+#define HISI_PCIE_REG_VERSION		0xfe4
+#define HISI_PCIE_REG_INFO		0xfe8
+
+/* Define command in HISI_PCIE_GLOBAL_CTRL */
+#define HISI_PCIE_GLOBAL_EN		0x01
+#define HISI_PCIE_GLOBAL_NONE		0
+
+/* Define command in HISI_PCIE_EVENT_CTRL */
+#define HISI_PCIE_EVENT_EN		BIT_ULL(20)
+#define HISI_PCIE_RESET_CNT		BIT_ULL(22)
+#define HISI_PCIE_INIT_SET		BIT_ULL(34)
+#define HISI_PCIE_THR_EN		BIT_ULL(26)
+#define HISI_PCIE_TARGET_EN		BIT_ULL(32)
+#define HISI_PCIE_TRIG_EN		BIT_ULL(52)
+
+/* Define offsets in HISI_PCIE_EVENT_CTRL */
+#define HISI_PCIE_EVENT_M		GENMASK_ULL(15, 0)
+#define HISI_PCIE_THR_MODE_M		GENMASK_ULL(27, 27)
+#define HISI_PCIE_THR_M			GENMASK_ULL(31, 28)
+#define HISI_PCIE_TARGET_M		GENMASK_ULL(52, 36)
+#define HISI_PCIE_TRIG_MODE_M		GENMASK_ULL(53, 53)
+#define HISI_PCIE_TRIG_M		GENMASK_ULL(59, 56)
+
+#define HISI_PCIE_MAX_COUNTERS		8
+#define HISI_PCIE_REG_STEP		8
+#define HISI_PCIE_THR_MAX_VAL		10
+#define HISI_PCIE_TRIG_MAX_VAL		10
+#define HISI_PCIE_MAX_PERIOD		(GENMASK_ULL(63, 0))
+#define HISI_PCIE_INIT_VAL		BIT_ULL(63)
+
+struct hisi_pcie_pmu {
+	struct perf_event *hw_events[HISI_PCIE_MAX_COUNTERS];
+	struct hlist_node node;
+	struct pci_dev *pdev;
+	struct pmu pmu;
+	void __iomem *base;
+	int irq;
+	u32 identifier;
+	/* Minimum and maximum BDF of root ports monitored by PMU */
+	u16 bdf_min;
+	u16 bdf_max;
+	int on_cpu;
+};
+
+struct hisi_pcie_reg_pair {
+	u16 lo;
+	u16 hi;
+};
+
+#define to_pcie_pmu(p)  (container_of((p), struct hisi_pcie_pmu, pmu))
+#define GET_PCI_DEVFN(bdf)  ((bdf) & 0xff)
+
+#define HISI_PCIE_PMU_FILTER_ATTR(_name, _config, _hi, _lo)		  \
+	static u64 hisi_pcie_get_##_name(struct perf_event *event)	  \
+	{								  \
+		return FIELD_GET(GENMASK(_hi, _lo), event->attr._config); \
+	}								  \
+
+HISI_PCIE_PMU_FILTER_ATTR(event, config, 16, 0);
+HISI_PCIE_PMU_FILTER_ATTR(thr_len, config1, 3, 0);
+HISI_PCIE_PMU_FILTER_ATTR(thr_mode, config1, 4, 4);
+HISI_PCIE_PMU_FILTER_ATTR(trig_len, config1, 8, 5);
+HISI_PCIE_PMU_FILTER_ATTR(trig_mode, config1, 9, 9);
+HISI_PCIE_PMU_FILTER_ATTR(port, config2, 15, 0);
+HISI_PCIE_PMU_FILTER_ATTR(bdf, config2, 31, 16);
+
+static ssize_t hisi_pcie_format_sysfs_show(struct device *dev, struct device_attribute *attr,
+					   char *buf)
+{
+	struct dev_ext_attribute *eattr;
+
+	eattr = container_of(attr, struct dev_ext_attribute, attr);
+
+	return sysfs_emit(buf, "%s\n", (char *)eattr->var);
+}
+
+static ssize_t hisi_pcie_event_sysfs_show(struct device *dev, struct device_attribute *attr,
+					  char *buf)
+{
+	struct perf_pmu_events_attr *pmu_attr =
+		container_of(attr, struct perf_pmu_events_attr, attr);
+
+	return sysfs_emit(buf, "config=0x%llx\n", pmu_attr->id);
+}
+
+#define HISI_PCIE_PMU_FORMAT_ATTR(_name, _format)                              \
+	(&((struct dev_ext_attribute[]){                                       \
+		{ .attr = __ATTR(_name, 0444, hisi_pcie_format_sysfs_show,     \
+				 NULL),                                        \
+		  .var = (void *)_format }                                     \
+	})[0].attr.attr)
+
+#define HISI_PCIE_PMU_EVENT_ATTR(_name, _id)			\
+	PMU_EVENT_ATTR_ID(_name, hisi_pcie_event_sysfs_show, _id)
+
+static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(dev_get_drvdata(dev));
+
+	return cpumap_print_to_pagebuf(true, buf, cpumask_of(pcie_pmu->on_cpu));
+}
+static DEVICE_ATTR_RO(cpumask);
+
+static ssize_t identifier_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(dev_get_drvdata(dev));
+
+	return sysfs_emit(buf, "%#x\n", pcie_pmu->identifier);
+}
+static DEVICE_ATTR_RO(identifier);
+
+static ssize_t bus_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(dev_get_drvdata(dev));
+
+	return sysfs_emit(buf, "%#04x\n", PCI_BUS_NUM(pcie_pmu->bdf_min));
+}
+static DEVICE_ATTR_RO(bus);
+
+static struct hisi_pcie_reg_pair
+hisi_pcie_parse_reg_value(struct hisi_pcie_pmu *pcie_pmu, u32 reg_off)
+{
+	u32 val = readl_relaxed(pcie_pmu->base + reg_off);
+	struct hisi_pcie_reg_pair regs = {
+		.lo = val,
+		.hi = val >> 16,
+	};
+
+	return regs;
+}
+
+/*
+ * Hardware counter and ext_counter work together for bandwidth, latency, bus
+ * utilization and buffer occupancy events. For example, RX memory write latency
+ * events(index = 0x0010), counter counts total delay cycles and ext_counter
+ * counts RX memory write PCIe packets number.
+ *
+ * As we don't want PMU driver to process these two data, "delay cycles" can
+ * be treated as an independent event(index = 0x0010), "RX memory write packets
+ * number" as another(index = 0x10010). BIT 16 is used to distinguish and 0-15
+ * bits are "real" event index, which can be used to set HISI_PCIE_EVENT_CTRL.
+ */
+#define EXT_COUNTER_IS_USED(idx)		((idx) & BIT(16))
+
+static u32 hisi_pcie_get_real_event(struct perf_event *event)
+{
+	return hisi_pcie_get_event(event) & GENMASK(15, 0);
+}
+
+static u32 hisi_pcie_pmu_get_offset(u32 offset, u32 idx)
+{
+	return offset + HISI_PCIE_REG_STEP * idx;
+}
+
+static u32 hisi_pcie_pmu_readl(struct hisi_pcie_pmu *pcie_pmu, u32 reg_offset,
+			       u32 idx)
+{
+	u32 offset = hisi_pcie_pmu_get_offset(reg_offset, idx);
+
+	return readl_relaxed(pcie_pmu->base + offset);
+}
+
+static void hisi_pcie_pmu_writel(struct hisi_pcie_pmu *pcie_pmu, u32 reg_offset, u32 idx, u32 val)
+{
+	u32 offset = hisi_pcie_pmu_get_offset(reg_offset, idx);
+
+	writel_relaxed(val, pcie_pmu->base + offset);
+}
+
+static u64 hisi_pcie_pmu_readq(struct hisi_pcie_pmu *pcie_pmu, u32 reg_offset, u32 idx)
+{
+	u32 offset = hisi_pcie_pmu_get_offset(reg_offset, idx);
+
+	return readq_relaxed(pcie_pmu->base + offset);
+}
+
+static void hisi_pcie_pmu_writeq(struct hisi_pcie_pmu *pcie_pmu, u32 reg_offset, u32 idx, u64 val)
+{
+	u32 offset = hisi_pcie_pmu_get_offset(reg_offset, idx);
+
+	writeq_relaxed(val, pcie_pmu->base + offset);
+}
+
+static void hisi_pcie_pmu_config_filter(struct perf_event *event)
+{
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	u64 reg = HISI_PCIE_INIT_SET;
+	u64 port, trig_len, thr_len;
+
+	/* Config HISI_PCIE_EVENT_CTRL according to event. */
+	reg |= FIELD_PREP(HISI_PCIE_EVENT_M, hisi_pcie_get_real_event(event));
+
+	/* Config HISI_PCIE_EVENT_CTRL according to root port or EP device. */
+	port = hisi_pcie_get_port(event);
+	if (port)
+		reg |= FIELD_PREP(HISI_PCIE_TARGET_M, port);
+	else
+		reg |= HISI_PCIE_TARGET_EN |
+		       FIELD_PREP(HISI_PCIE_TARGET_M, hisi_pcie_get_bdf(event));
+
+	/* Config HISI_PCIE_EVENT_CTRL according to trigger condition. */
+	trig_len = hisi_pcie_get_trig_len(event);
+	if (trig_len) {
+		reg |= FIELD_PREP(HISI_PCIE_TRIG_M, trig_len);
+		reg |= FIELD_PREP(HISI_PCIE_TRIG_MODE_M, hisi_pcie_get_trig_mode(event));
+		reg |= HISI_PCIE_TRIG_EN;
+	}
+
+	/* Config HISI_PCIE_EVENT_CTRL according to threshold condition. */
+	thr_len = hisi_pcie_get_thr_len(event);
+	if (thr_len) {
+		reg |= FIELD_PREP(HISI_PCIE_THR_M, thr_len);
+		reg |= FIELD_PREP(HISI_PCIE_THR_MODE_M, hisi_pcie_get_thr_mode(event));
+		reg |= HISI_PCIE_THR_EN;
+	}
+
+	hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EVENT_CTRL, hwc->idx, reg);
+}
+
+static void hisi_pcie_pmu_clear_filter(struct perf_event *event)
+{
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+
+	hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EVENT_CTRL, hwc->idx, HISI_PCIE_INIT_SET);
+}
+
+static bool hisi_pcie_pmu_valid_requester_id(struct hisi_pcie_pmu *pcie_pmu, u32 bdf)
+{
+	struct pci_dev *root_port, *pdev;
+	u16 rp_bdf;
+
+	pdev = pci_get_domain_bus_and_slot(pci_domain_nr(pcie_pmu->pdev->bus), PCI_BUS_NUM(bdf),
+					   GET_PCI_DEVFN(bdf));
+	if (!pdev)
+		return false;
+
+	root_port = pcie_find_root_port(pdev);
+	if (!root_port) {
+		pci_dev_put(pdev);
+		return false;
+	}
+
+	pci_dev_put(pdev);
+	rp_bdf = pci_dev_id(root_port);
+	return rp_bdf >= pcie_pmu->bdf_min && rp_bdf <= pcie_pmu->bdf_max;
+}
+
+static bool hisi_pcie_pmu_valid_filter(struct perf_event *event,
+				       struct hisi_pcie_pmu *pcie_pmu)
+{
+	u32 requester_id = hisi_pcie_get_bdf(event);
+
+	if (hisi_pcie_get_thr_len(event) > HISI_PCIE_THR_MAX_VAL)
+		return false;
+
+	if (hisi_pcie_get_trig_len(event) > HISI_PCIE_TRIG_MAX_VAL)
+		return false;
+
+	if (requester_id) {
+		if (!hisi_pcie_pmu_valid_requester_id(pcie_pmu, requester_id))
+			return false;
+	}
+
+	return true;
+}
+
+static bool hisi_pcie_pmu_cmp_event(struct perf_event *target,
+					struct perf_event *event)
+{
+	return hisi_pcie_get_real_event(target) == hisi_pcie_get_real_event(event);
+}
+
+static bool hisi_pcie_pmu_validate_event_group(struct perf_event *event)
+{
+	struct perf_event *sibling, *leader = event->group_leader;
+	struct perf_event *event_group[HISI_PCIE_MAX_COUNTERS];
+	int counters = 1;
+	int num;
+
+	event_group[0] = leader;
+	if (!is_software_event(leader)) {
+		if (leader->pmu != event->pmu)
+			return false;
+
+		if (leader != event && !hisi_pcie_pmu_cmp_event(leader, event))
+			event_group[counters++] = event;
+	}
+
+	for_each_sibling_event(sibling, event->group_leader) {
+		if (is_software_event(sibling))
+			continue;
+
+		if (sibling->pmu != event->pmu)
+			return false;
+
+		for (num = 0; num < counters; num++) {
+			if (hisi_pcie_pmu_cmp_event(event_group[num], sibling))
+				break;
+		}
+
+		if (num == counters)
+			event_group[counters++] = sibling;
+	}
+
+	return counters <= HISI_PCIE_MAX_COUNTERS;
+}
+
+static int hisi_pcie_pmu_event_init(struct perf_event *event)
+{
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+
+	event->cpu = pcie_pmu->on_cpu;
+
+	if (EXT_COUNTER_IS_USED(hisi_pcie_get_event(event)))
+		hwc->event_base = HISI_PCIE_EXT_CNT;
+	else
+		hwc->event_base = HISI_PCIE_CNT;
+
+	if (event->attr.type != event->pmu->type)
+		return -ENOENT;
+
+	/* Sampling is not supported. */
+	if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
+		return -EOPNOTSUPP;
+
+	if (!hisi_pcie_pmu_valid_filter(event, pcie_pmu))
+		return -EINVAL;
+
+	if (!hisi_pcie_pmu_validate_event_group(event))
+		return -EINVAL;
+
+	return 0;
+}
+
+static u64 hisi_pcie_pmu_read_counter(struct perf_event *event)
+{
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
+	u32 idx = event->hw.idx;
+
+	return hisi_pcie_pmu_readq(pcie_pmu, event->hw.event_base, idx);
+}
+
+static int hisi_pcie_pmu_find_related_event(struct hisi_pcie_pmu *pcie_pmu,
+					    struct perf_event *event)
+{
+	struct perf_event *sibling;
+	int idx;
+
+	for (idx = 0; idx < HISI_PCIE_MAX_COUNTERS; idx++) {
+		sibling = pcie_pmu->hw_events[idx];
+		if (!sibling)
+			continue;
+
+		if (!hisi_pcie_pmu_cmp_event(sibling, event))
+			continue;
+
+		/* Related events must be used in group */
+		if (sibling->group_leader == event->group_leader)
+			return idx;
+		else
+			return -EINVAL;
+	}
+
+	return idx;
+}
+
+static int hisi_pcie_pmu_get_event_idx(struct hisi_pcie_pmu *pcie_pmu)
+{
+	int idx;
+
+	for (idx = 0; idx < HISI_PCIE_MAX_COUNTERS; idx++) {
+		if (!pcie_pmu->hw_events[idx])
+			return idx;
+	}
+
+	return -EINVAL;
+}
+
+static void hisi_pcie_pmu_event_update(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	u64 new_cnt, prev_cnt, delta;
+
+	do {
+		prev_cnt = local64_read(&hwc->prev_count);
+		new_cnt = hisi_pcie_pmu_read_counter(event);
+	} while (local64_cmpxchg(&hwc->prev_count, prev_cnt,
+				 new_cnt) != prev_cnt);
+
+	delta = (new_cnt - prev_cnt) & HISI_PCIE_MAX_PERIOD;
+	local64_add(delta, &event->count);
+}
+
+static void hisi_pcie_pmu_read(struct perf_event *event)
+{
+	hisi_pcie_pmu_event_update(event);
+}
+
+static void hisi_pcie_pmu_set_period(struct perf_event *event)
+{
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	int idx = hwc->idx;
+
+	local64_set(&hwc->prev_count, HISI_PCIE_INIT_VAL);
+	hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_CNT, idx, HISI_PCIE_INIT_VAL);
+	hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EXT_CNT, idx, HISI_PCIE_INIT_VAL);
+}
+
+static void hisi_pcie_pmu_enable_counter(struct hisi_pcie_pmu *pcie_pmu, struct hw_perf_event *hwc)
+{
+	u32 idx = hwc->idx;
+	u64 val;
+
+	val = hisi_pcie_pmu_readq(pcie_pmu, HISI_PCIE_EVENT_CTRL, idx);
+	val |= HISI_PCIE_EVENT_EN;
+	hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EVENT_CTRL, idx, val);
+}
+
+static void hisi_pcie_pmu_disable_counter(struct hisi_pcie_pmu *pcie_pmu, struct hw_perf_event *hwc)
+{
+	u32 idx = hwc->idx;
+	u64 val;
+
+	val = hisi_pcie_pmu_readq(pcie_pmu, HISI_PCIE_EVENT_CTRL, idx);
+	val &= ~HISI_PCIE_EVENT_EN;
+	hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EVENT_CTRL, idx, val);
+}
+
+static void hisi_pcie_pmu_enable_int(struct hisi_pcie_pmu *pcie_pmu, struct hw_perf_event *hwc)
+{
+	u32 idx = hwc->idx;
+
+	hisi_pcie_pmu_writel(pcie_pmu, HISI_PCIE_INT_MASK, idx, 0);
+}
+
+static void hisi_pcie_pmu_disable_int(struct hisi_pcie_pmu *pcie_pmu, struct hw_perf_event *hwc)
+{
+	u32 idx = hwc->idx;
+
+	hisi_pcie_pmu_writel(pcie_pmu, HISI_PCIE_INT_MASK, idx, 1);
+}
+
+static void hisi_pcie_pmu_reset_counter(struct hisi_pcie_pmu *pcie_pmu, int idx)
+{
+	hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EVENT_CTRL, idx, HISI_PCIE_RESET_CNT);
+	hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EVENT_CTRL, idx, HISI_PCIE_INIT_SET);
+}
+
+static void hisi_pcie_pmu_start(struct perf_event *event, int flags)
+{
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	int idx = hwc->idx;
+	u64 prev_cnt;
+
+	if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
+		return;
+
+	WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
+	hwc->state = 0;
+
+	hisi_pcie_pmu_config_filter(event);
+	hisi_pcie_pmu_enable_counter(pcie_pmu, hwc);
+	hisi_pcie_pmu_enable_int(pcie_pmu, hwc);
+	hisi_pcie_pmu_set_period(event);
+
+	if (flags & PERF_EF_RELOAD) {
+		prev_cnt = local64_read(&hwc->prev_count);
+		hisi_pcie_pmu_writeq(pcie_pmu, hwc->event_base, idx, prev_cnt);
+	}
+
+	perf_event_update_userpage(event);
+}
+
+static void hisi_pcie_pmu_stop(struct perf_event *event, int flags)
+{
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+
+	hisi_pcie_pmu_event_update(event);
+	hisi_pcie_pmu_disable_int(pcie_pmu, hwc);
+	hisi_pcie_pmu_disable_counter(pcie_pmu, hwc);
+	hisi_pcie_pmu_clear_filter(event);
+	WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+	hwc->state |= PERF_HES_STOPPED;
+
+	if (hwc->state & PERF_HES_UPTODATE)
+		return;
+
+	hwc->state |= PERF_HES_UPTODATE;
+}
+
+static int hisi_pcie_pmu_add(struct perf_event *event, int flags)
+{
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	int idx;
+
+	hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
+
+	/* Check all working events to find a related event. */
+	idx = hisi_pcie_pmu_find_related_event(pcie_pmu, event);
+	if (idx < 0)
+		return idx;
+
+	/* Current event shares an enabled counter with the related event */
+	if (idx < HISI_PCIE_MAX_COUNTERS) {
+		hwc->idx = idx;
+		goto start_count;
+	}
+
+	idx = hisi_pcie_pmu_get_event_idx(pcie_pmu);
+	if (idx < 0)
+		return idx;
+
+	hwc->idx = idx;
+	pcie_pmu->hw_events[idx] = event;
+	/* Reset Counter to avoid previous statistic interference. */
+	hisi_pcie_pmu_reset_counter(pcie_pmu, idx);
+
+start_count:
+	if (flags & PERF_EF_START)
+		hisi_pcie_pmu_start(event, PERF_EF_RELOAD);
+
+	return 0;
+}
+
+static void hisi_pcie_pmu_del(struct perf_event *event, int flags)
+{
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+
+	hisi_pcie_pmu_stop(event, PERF_EF_UPDATE);
+	pcie_pmu->hw_events[hwc->idx] = NULL;
+	perf_event_update_userpage(event);
+}
+
+static void hisi_pcie_pmu_enable(struct pmu *pmu)
+{
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(pmu);
+	int num;
+
+	for (num = 0; num < HISI_PCIE_MAX_COUNTERS; num++) {
+		if (pcie_pmu->hw_events[num])
+			break;
+	}
+
+	if (num == HISI_PCIE_MAX_COUNTERS)
+		return;
+
+	writel(HISI_PCIE_GLOBAL_EN, pcie_pmu->base + HISI_PCIE_GLOBAL_CTRL);
+}
+
+static void hisi_pcie_pmu_disable(struct pmu *pmu)
+{
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(pmu);
+
+	writel(HISI_PCIE_GLOBAL_NONE, pcie_pmu->base + HISI_PCIE_GLOBAL_CTRL);
+}
+
+static irqreturn_t hisi_pcie_pmu_irq(int irq, void *data)
+{
+	struct hisi_pcie_pmu *pcie_pmu = data;
+	irqreturn_t ret = IRQ_NONE;
+	struct perf_event *event;
+	u32 overflown;
+	int idx;
+
+	for (idx = 0; idx < HISI_PCIE_MAX_COUNTERS; idx++) {
+		overflown = hisi_pcie_pmu_readl(pcie_pmu, HISI_PCIE_INT_STAT, idx);
+		if (!overflown)
+			continue;
+
+		/* Clear status of interrupt. */
+		hisi_pcie_pmu_writel(pcie_pmu, HISI_PCIE_INT_STAT, idx, 1);
+		event = pcie_pmu->hw_events[idx];
+		if (!event)
+			continue;
+
+		hisi_pcie_pmu_event_update(event);
+		hisi_pcie_pmu_set_period(event);
+		ret = IRQ_HANDLED;
+	}
+
+	return ret;
+}
+
+static int hisi_pcie_pmu_irq_register(struct pci_dev *pdev, struct hisi_pcie_pmu *pcie_pmu)
+{
+	int irq, ret;
+
+	ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_MSI);
+	if (ret < 0) {
+		pci_err(pdev, "Failed to enable MSI vectors: %d\n", ret);
+		return ret;
+	}
+
+	irq = pci_irq_vector(pdev, 0);
+	ret = request_irq(irq, hisi_pcie_pmu_irq, IRQF_NOBALANCING | IRQF_NO_THREAD, DRV_NAME,
+			  pcie_pmu);
+	if (ret) {
+		pci_err(pdev, "Failed to register IRQ: %d\n", ret);
+		pci_free_irq_vectors(pdev);
+		return ret;
+	}
+
+	pcie_pmu->irq = irq;
+
+	return 0;
+}
+
+static void hisi_pcie_pmu_irq_unregister(struct pci_dev *pdev, struct hisi_pcie_pmu *pcie_pmu)
+{
+	free_irq(pcie_pmu->irq, pcie_pmu);
+	pci_free_irq_vectors(pdev);
+}
+
+static int hisi_pcie_pmu_online_cpu(unsigned int cpu, struct hlist_node *node)
+{
+	struct hisi_pcie_pmu *pcie_pmu = hlist_entry_safe(node, struct hisi_pcie_pmu, node);
+
+	if (pcie_pmu->on_cpu == -1) {
+		pcie_pmu->on_cpu = cpu;
+		WARN_ON(irq_set_affinity(pcie_pmu->irq, cpumask_of(cpu)));
+	}
+
+	return 0;
+}
+
+static int hisi_pcie_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
+{
+	struct hisi_pcie_pmu *pcie_pmu = hlist_entry_safe(node, struct hisi_pcie_pmu, node);
+	unsigned int target;
+
+	/* Nothing to do if this CPU doesn't own the PMU */
+	if (pcie_pmu->on_cpu != cpu)
+		return 0;
+
+	pcie_pmu->on_cpu = -1;
+	/* Choose a new CPU from all online cpus. */
+	target = cpumask_first(cpu_online_mask);
+	if (target >= nr_cpu_ids) {
+		pci_err(pcie_pmu->pdev, "There is no CPU to set\n");
+		return 0;
+	}
+
+	perf_pmu_migrate_context(&pcie_pmu->pmu, cpu, target);
+	/* Use this CPU for event counting */
+	pcie_pmu->on_cpu = target;
+	WARN_ON(irq_set_affinity(pcie_pmu->irq, cpumask_of(target)));
+
+	return 0;
+}
+
+static struct attribute *hisi_pcie_pmu_events_attr[] = {
+	HISI_PCIE_PMU_EVENT_ATTR(rx_mwr_latency, 0x0010),
+	HISI_PCIE_PMU_EVENT_ATTR(rx_mwr_cnt, 0x10010),
+	HISI_PCIE_PMU_EVENT_ATTR(rx_mrd_latency, 0x0210),
+	HISI_PCIE_PMU_EVENT_ATTR(rx_mrd_cnt, 0x10210),
+	HISI_PCIE_PMU_EVENT_ATTR(tx_mrd_latency, 0x0011),
+	HISI_PCIE_PMU_EVENT_ATTR(tx_mrd_cnt, 0x10011),
+	HISI_PCIE_PMU_EVENT_ATTR(rx_mrd_flux, 0x1005),
+	HISI_PCIE_PMU_EVENT_ATTR(rx_mrd_time, 0x11005),
+	HISI_PCIE_PMU_EVENT_ATTR(tx_mrd_flux, 0x2004),
+	HISI_PCIE_PMU_EVENT_ATTR(tx_mrd_time, 0x12004),
+	NULL
+};
+
+static struct attribute_group hisi_pcie_pmu_events_group = {
+	.name = "events",
+	.attrs = hisi_pcie_pmu_events_attr,
+};
+
+static struct attribute *hisi_pcie_pmu_format_attr[] = {
+	HISI_PCIE_PMU_FORMAT_ATTR(event, "config:0-16"),
+	HISI_PCIE_PMU_FORMAT_ATTR(thr_len, "config1:0-3"),
+	HISI_PCIE_PMU_FORMAT_ATTR(thr_mode, "config1:4"),
+	HISI_PCIE_PMU_FORMAT_ATTR(trig_len, "config1:5-8"),
+	HISI_PCIE_PMU_FORMAT_ATTR(trig_mode, "config1:9"),
+	HISI_PCIE_PMU_FORMAT_ATTR(port, "config2:0-15"),
+	HISI_PCIE_PMU_FORMAT_ATTR(bdf, "config2:16-31"),
+	NULL
+};
+
+static const struct attribute_group hisi_pcie_pmu_format_group = {
+	.name = "format",
+	.attrs = hisi_pcie_pmu_format_attr,
+};
+
+static struct attribute *hisi_pcie_pmu_bus_attrs[] = {
+	&dev_attr_bus.attr,
+	NULL
+};
+
+static const struct attribute_group hisi_pcie_pmu_bus_attr_group = {
+	.attrs = hisi_pcie_pmu_bus_attrs,
+};
+
+static struct attribute *hisi_pcie_pmu_cpumask_attrs[] = {
+	&dev_attr_cpumask.attr,
+	NULL
+};
+
+static const struct attribute_group hisi_pcie_pmu_cpumask_attr_group = {
+	.attrs = hisi_pcie_pmu_cpumask_attrs,
+};
+
+static struct attribute *hisi_pcie_pmu_identifier_attrs[] = {
+	&dev_attr_identifier.attr,
+	NULL
+};
+
+static const struct attribute_group hisi_pcie_pmu_identifier_attr_group = {
+	.attrs = hisi_pcie_pmu_identifier_attrs,
+};
+
+static const struct attribute_group *hisi_pcie_pmu_attr_groups[] = {
+	&hisi_pcie_pmu_events_group,
+	&hisi_pcie_pmu_format_group,
+	&hisi_pcie_pmu_bus_attr_group,
+	&hisi_pcie_pmu_cpumask_attr_group,
+	&hisi_pcie_pmu_identifier_attr_group,
+	NULL
+};
+
+static int hisi_pcie_alloc_pmu(struct pci_dev *pdev, struct hisi_pcie_pmu *pcie_pmu)
+{
+	struct hisi_pcie_reg_pair regs;
+	u16 sicl_id, core_id;
+	char *name;
+
+	regs = hisi_pcie_parse_reg_value(pcie_pmu, HISI_PCIE_REG_BDF);
+	pcie_pmu->bdf_min = regs.lo;
+	pcie_pmu->bdf_max = regs.hi;
+
+	regs = hisi_pcie_parse_reg_value(pcie_pmu, HISI_PCIE_REG_INFO);
+	sicl_id = regs.hi;
+	core_id = regs.lo;
+
+	name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "hisi_pcie%u_core%u", sicl_id, core_id);
+	if (!name)
+		return -ENOMEM;
+
+	pcie_pmu->pdev = pdev;
+	pcie_pmu->on_cpu = -1;
+	pcie_pmu->identifier = readl(pcie_pmu->base + HISI_PCIE_REG_VERSION);
+	pcie_pmu->pmu = (struct pmu) {
+		.name		= name,
+		.module		= THIS_MODULE,
+		.event_init	= hisi_pcie_pmu_event_init,
+		.pmu_enable	= hisi_pcie_pmu_enable,
+		.pmu_disable	= hisi_pcie_pmu_disable,
+		.add		= hisi_pcie_pmu_add,
+		.del		= hisi_pcie_pmu_del,
+		.start		= hisi_pcie_pmu_start,
+		.stop		= hisi_pcie_pmu_stop,
+		.read		= hisi_pcie_pmu_read,
+		.task_ctx_nr	= perf_invalid_context,
+		.attr_groups	= hisi_pcie_pmu_attr_groups,
+		.capabilities	= PERF_PMU_CAP_NO_EXCLUDE,
+	};
+
+	return 0;
+}
+
+static int hisi_pcie_init_pmu(struct pci_dev *pdev, struct hisi_pcie_pmu *pcie_pmu)
+{
+	int ret;
+
+	pcie_pmu->base = pci_ioremap_bar(pdev, 2);
+	if (!pcie_pmu->base) {
+		pci_err(pdev, "Ioremap failed for pcie_pmu resource\n");
+		return -ENOMEM;
+	}
+
+	ret = hisi_pcie_alloc_pmu(pdev, pcie_pmu);
+	if (ret)
+		goto err_iounmap;
+
+	ret = hisi_pcie_pmu_irq_register(pdev, pcie_pmu);
+	if (ret)
+		goto err_iounmap;
+
+	ret = cpuhp_state_add_instance(CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE, &pcie_pmu->node);
+	if (ret) {
+		pci_err(pdev, "Failed to register hotplug: %d\n", ret);
+		goto err_irq_unregister;
+	}
+
+	ret = perf_pmu_register(&pcie_pmu->pmu, pcie_pmu->pmu.name, -1);
+	if (ret) {
+		pci_err(pdev, "Failed to register PCIe PMU: %d\n", ret);
+		goto err_hotplug_unregister;
+	}
+
+	return ret;
+
+err_hotplug_unregister:
+	cpuhp_state_remove_instance_nocalls(
+		CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE, &pcie_pmu->node);
+
+err_irq_unregister:
+	hisi_pcie_pmu_irq_unregister(pdev, pcie_pmu);
+
+err_iounmap:
+	iounmap(pcie_pmu->base);
+
+	return ret;
+}
+
+static void hisi_pcie_uninit_pmu(struct pci_dev *pdev)
+{
+	struct hisi_pcie_pmu *pcie_pmu = pci_get_drvdata(pdev);
+
+	perf_pmu_unregister(&pcie_pmu->pmu);
+	cpuhp_state_remove_instance_nocalls(
+		CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE, &pcie_pmu->node);
+	hisi_pcie_pmu_irq_unregister(pdev, pcie_pmu);
+	iounmap(pcie_pmu->base);
+}
+
+static int hisi_pcie_init_dev(struct pci_dev *pdev)
+{
+	int ret;
+
+	ret = pcim_enable_device(pdev);
+	if (ret) {
+		pci_err(pdev, "Failed to enable PCI device: %d\n", ret);
+		return ret;
+	}
+
+	ret = pcim_iomap_regions(pdev, BIT(2), DRV_NAME);
+	if (ret < 0) {
+		pci_err(pdev, "Failed to request PCI mem regions: %d\n", ret);
+		return ret;
+	}
+
+	pci_set_master(pdev);
+
+	return 0;
+}
+
+static int hisi_pcie_pmu_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	struct hisi_pcie_pmu *pcie_pmu;
+	int ret;
+
+	pcie_pmu = devm_kzalloc(&pdev->dev, sizeof(*pcie_pmu), GFP_KERNEL);
+	if (!pcie_pmu)
+		return -ENOMEM;
+
+	ret = hisi_pcie_init_dev(pdev);
+	if (ret)
+		return ret;
+
+	ret = hisi_pcie_init_pmu(pdev, pcie_pmu);
+	if (ret)
+		return ret;
+
+	pci_set_drvdata(pdev, pcie_pmu);
+
+	return ret;
+}
+
+static void hisi_pcie_pmu_remove(struct pci_dev *pdev)
+{
+	hisi_pcie_uninit_pmu(pdev);
+	pci_set_drvdata(pdev, NULL);
+}
+
+static const struct pci_device_id hisi_pcie_pmu_ids[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_HUAWEI, 0xa12d) },
+	{ 0, }
+};
+MODULE_DEVICE_TABLE(pci, hisi_pcie_pmu_ids);
+
+static struct pci_driver hisi_pcie_pmu_driver = {
+	.name = DRV_NAME,
+	.id_table = hisi_pcie_pmu_ids,
+	.probe = hisi_pcie_pmu_probe,
+	.remove = hisi_pcie_pmu_remove,
+};
+
+static int __init hisi_pcie_module_init(void)
+{
+	int ret;
+
+	ret = cpuhp_setup_state_multi(CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE,
+				      "AP_PERF_ARM_HISI_PCIE_PMU_ONLINE",
+				      hisi_pcie_pmu_online_cpu,
+				      hisi_pcie_pmu_offline_cpu);
+	if (ret) {
+		pr_err("Failed to setup PCIe PMU hotplug: %d\n", ret);
+		return ret;
+	}
+
+	ret = pci_register_driver(&hisi_pcie_pmu_driver);
+	if (ret)
+		cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE);
+
+	return ret;
+}
+module_init(hisi_pcie_module_init);
+
+static void __exit hisi_pcie_module_exit(void)
+{
+	pci_unregister_driver(&hisi_pcie_pmu_driver);
+	cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE);
+}
+module_exit(hisi_pcie_module_exit);
+
+MODULE_DESCRIPTION("HiSilicon PCIe PMU driver");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Qi Liu <liuqi115@huawei.com>");
diff --git a/drivers/perf/marvell_cn10k_tad_pmu.c b/drivers/perf/marvell_cn10k_tad_pmu.c
new file mode 100644
index 000000000000..7f4d292658e3
--- /dev/null
+++ b/drivers/perf/marvell_cn10k_tad_pmu.c
@@ -0,0 +1,429 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Marvell CN10K LLC-TAD perf driver
+ *
+ * Copyright (C) 2021 Marvell
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) "tad_pmu: " fmt
+
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/of_device.h>
+#include <linux/cpuhotplug.h>
+#include <linux/perf_event.h>
+#include <linux/platform_device.h>
+
+#define TAD_PFC_OFFSET		0x0
+#define TAD_PFC(counter)	(TAD_PFC_OFFSET | (counter << 3))
+#define TAD_PRF_OFFSET		0x100
+#define TAD_PRF(counter)	(TAD_PRF_OFFSET | (counter << 3))
+#define TAD_PRF_CNTSEL_MASK	0xFF
+#define TAD_MAX_COUNTERS	8
+
+#define to_tad_pmu(p) (container_of(p, struct tad_pmu, pmu))
+
+struct tad_region {
+	void __iomem	*base;
+};
+
+struct tad_pmu {
+	struct pmu pmu;
+	struct tad_region *regions;
+	u32 region_cnt;
+	unsigned int cpu;
+	struct hlist_node node;
+	struct perf_event *events[TAD_MAX_COUNTERS];
+	DECLARE_BITMAP(counters_map, TAD_MAX_COUNTERS);
+};
+
+static int tad_pmu_cpuhp_state;
+
+static void tad_pmu_event_counter_read(struct perf_event *event)
+{
+	struct tad_pmu *tad_pmu = to_tad_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	u32 counter_idx = hwc->idx;
+	u64 prev, new;
+	int i;
+
+	do {
+		prev = local64_read(&hwc->prev_count);
+		for (i = 0, new = 0; i < tad_pmu->region_cnt; i++)
+			new += readq(tad_pmu->regions[i].base +
+				     TAD_PFC(counter_idx));
+	} while (local64_cmpxchg(&hwc->prev_count, prev, new) != prev);
+
+	local64_add(new - prev, &event->count);
+}
+
+static void tad_pmu_event_counter_stop(struct perf_event *event, int flags)
+{
+	struct tad_pmu *tad_pmu = to_tad_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	u32 counter_idx = hwc->idx;
+	int i;
+
+	/* TAD()_PFC() stop counting on the write
+	 * which sets TAD()_PRF()[CNTSEL] == 0
+	 */
+	for (i = 0; i < tad_pmu->region_cnt; i++) {
+		writeq_relaxed(0, tad_pmu->regions[i].base +
+			       TAD_PRF(counter_idx));
+	}
+
+	tad_pmu_event_counter_read(event);
+	hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
+}
+
+static void tad_pmu_event_counter_start(struct perf_event *event, int flags)
+{
+	struct tad_pmu *tad_pmu = to_tad_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	u32 event_idx = event->attr.config;
+	u32 counter_idx = hwc->idx;
+	u64 reg_val;
+	int i;
+
+	hwc->state = 0;
+
+	/* Typically TAD_PFC() are zeroed to start counting */
+	for (i = 0; i < tad_pmu->region_cnt; i++)
+		writeq_relaxed(0, tad_pmu->regions[i].base +
+			       TAD_PFC(counter_idx));
+
+	/* TAD()_PFC() start counting on the write
+	 * which sets TAD()_PRF()[CNTSEL] != 0
+	 */
+	for (i = 0; i < tad_pmu->region_cnt; i++) {
+		reg_val = readq_relaxed(tad_pmu->regions[i].base +
+					TAD_PRF(counter_idx));
+		reg_val |= (event_idx & 0xFF);
+		writeq_relaxed(reg_val,	tad_pmu->regions[i].base +
+			       TAD_PRF(counter_idx));
+	}
+}
+
+static void tad_pmu_event_counter_del(struct perf_event *event, int flags)
+{
+	struct tad_pmu *tad_pmu = to_tad_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	int idx = hwc->idx;
+
+	tad_pmu_event_counter_stop(event, flags | PERF_EF_UPDATE);
+	tad_pmu->events[idx] = NULL;
+	clear_bit(idx, tad_pmu->counters_map);
+}
+
+static int tad_pmu_event_counter_add(struct perf_event *event, int flags)
+{
+	struct tad_pmu *tad_pmu = to_tad_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	int idx;
+
+	/* Get a free counter for this event */
+	idx = find_first_zero_bit(tad_pmu->counters_map, TAD_MAX_COUNTERS);
+	if (idx == TAD_MAX_COUNTERS)
+		return -EAGAIN;
+
+	set_bit(idx, tad_pmu->counters_map);
+
+	hwc->idx = idx;
+	hwc->state = PERF_HES_STOPPED;
+	tad_pmu->events[idx] = event;
+
+	if (flags & PERF_EF_START)
+		tad_pmu_event_counter_start(event, flags);
+
+	return 0;
+}
+
+static int tad_pmu_event_init(struct perf_event *event)
+{
+	struct tad_pmu *tad_pmu = to_tad_pmu(event->pmu);
+
+	if (!event->attr.disabled)
+		return -EINVAL;
+
+	if (event->attr.type != event->pmu->type)
+		return -ENOENT;
+
+	if (event->state != PERF_EVENT_STATE_OFF)
+		return -EINVAL;
+
+	event->cpu = tad_pmu->cpu;
+	event->hw.idx = -1;
+	event->hw.config_base = event->attr.config;
+
+	return 0;
+}
+
+static ssize_t tad_pmu_event_show(struct device *dev,
+				struct device_attribute *attr, char *page)
+{
+	struct perf_pmu_events_attr *pmu_attr;
+
+	pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr);
+	return sysfs_emit(page, "event=0x%02llx\n", pmu_attr->id);
+}
+
+#define TAD_PMU_EVENT_ATTR(name, config)			\
+	PMU_EVENT_ATTR_ID(name, tad_pmu_event_show, config)
+
+static struct attribute *tad_pmu_event_attrs[] = {
+	TAD_PMU_EVENT_ATTR(tad_none, 0x0),
+	TAD_PMU_EVENT_ATTR(tad_req_msh_in_any, 0x1),
+	TAD_PMU_EVENT_ATTR(tad_req_msh_in_mn, 0x2),
+	TAD_PMU_EVENT_ATTR(tad_req_msh_in_exlmn, 0x3),
+	TAD_PMU_EVENT_ATTR(tad_rsp_msh_in_any, 0x4),
+	TAD_PMU_EVENT_ATTR(tad_rsp_msh_in_mn, 0x5),
+	TAD_PMU_EVENT_ATTR(tad_rsp_msh_in_exlmn, 0x6),
+	TAD_PMU_EVENT_ATTR(tad_rsp_msh_in_dss, 0x7),
+	TAD_PMU_EVENT_ATTR(tad_rsp_msh_in_retry_dss, 0x8),
+	TAD_PMU_EVENT_ATTR(tad_dat_msh_in_any, 0x9),
+	TAD_PMU_EVENT_ATTR(tad_dat_msh_in_dss, 0xa),
+	TAD_PMU_EVENT_ATTR(tad_req_msh_out_any, 0xb),
+	TAD_PMU_EVENT_ATTR(tad_req_msh_out_dss_rd, 0xc),
+	TAD_PMU_EVENT_ATTR(tad_req_msh_out_dss_wr, 0xd),
+	TAD_PMU_EVENT_ATTR(tad_req_msh_out_evict, 0xe),
+	TAD_PMU_EVENT_ATTR(tad_rsp_msh_out_any, 0xf),
+	TAD_PMU_EVENT_ATTR(tad_rsp_msh_out_retry_exlmn, 0x10),
+	TAD_PMU_EVENT_ATTR(tad_rsp_msh_out_retry_mn, 0x11),
+	TAD_PMU_EVENT_ATTR(tad_rsp_msh_out_exlmn, 0x12),
+	TAD_PMU_EVENT_ATTR(tad_rsp_msh_out_mn, 0x13),
+	TAD_PMU_EVENT_ATTR(tad_snp_msh_out_any, 0x14),
+	TAD_PMU_EVENT_ATTR(tad_snp_msh_out_mn, 0x15),
+	TAD_PMU_EVENT_ATTR(tad_snp_msh_out_exlmn, 0x16),
+	TAD_PMU_EVENT_ATTR(tad_dat_msh_out_any, 0x17),
+	TAD_PMU_EVENT_ATTR(tad_dat_msh_out_fill, 0x18),
+	TAD_PMU_EVENT_ATTR(tad_dat_msh_out_dss, 0x19),
+	TAD_PMU_EVENT_ATTR(tad_alloc_dtg, 0x1a),
+	TAD_PMU_EVENT_ATTR(tad_alloc_ltg, 0x1b),
+	TAD_PMU_EVENT_ATTR(tad_alloc_any, 0x1c),
+	TAD_PMU_EVENT_ATTR(tad_hit_dtg, 0x1d),
+	TAD_PMU_EVENT_ATTR(tad_hit_ltg, 0x1e),
+	TAD_PMU_EVENT_ATTR(tad_hit_any, 0x1f),
+	TAD_PMU_EVENT_ATTR(tad_tag_rd, 0x20),
+	TAD_PMU_EVENT_ATTR(tad_dat_rd, 0x21),
+	TAD_PMU_EVENT_ATTR(tad_dat_rd_byp, 0x22),
+	TAD_PMU_EVENT_ATTR(tad_ifb_occ, 0x23),
+	TAD_PMU_EVENT_ATTR(tad_req_occ, 0x24),
+	NULL
+};
+
+static const struct attribute_group tad_pmu_events_attr_group = {
+	.name = "events",
+	.attrs = tad_pmu_event_attrs,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-7");
+
+static struct attribute *tad_pmu_format_attrs[] = {
+	&format_attr_event.attr,
+	NULL
+};
+
+static struct attribute_group tad_pmu_format_attr_group = {
+	.name = "format",
+	.attrs = tad_pmu_format_attrs,
+};
+
+static ssize_t tad_pmu_cpumask_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct tad_pmu *tad_pmu = to_tad_pmu(dev_get_drvdata(dev));
+
+	return cpumap_print_to_pagebuf(true, buf, cpumask_of(tad_pmu->cpu));
+}
+
+static DEVICE_ATTR(cpumask, 0444, tad_pmu_cpumask_show, NULL);
+
+static struct attribute *tad_pmu_cpumask_attrs[] = {
+	&dev_attr_cpumask.attr,
+	NULL
+};
+
+static struct attribute_group tad_pmu_cpumask_attr_group = {
+	.attrs = tad_pmu_cpumask_attrs,
+};
+
+static const struct attribute_group *tad_pmu_attr_groups[] = {
+	&tad_pmu_events_attr_group,
+	&tad_pmu_format_attr_group,
+	&tad_pmu_cpumask_attr_group,
+	NULL
+};
+
+static int tad_pmu_probe(struct platform_device *pdev)
+{
+	struct device_node *node = pdev->dev.of_node;
+	struct tad_region *regions;
+	struct tad_pmu *tad_pmu;
+	struct resource *res;
+	u32 tad_pmu_page_size;
+	u32 tad_page_size;
+	u32 tad_cnt;
+	int i, ret;
+	char *name;
+
+	tad_pmu = devm_kzalloc(&pdev->dev, sizeof(*tad_pmu), GFP_KERNEL);
+	if (!tad_pmu)
+		return -ENOMEM;
+
+	platform_set_drvdata(pdev, tad_pmu);
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res) {
+		dev_err(&pdev->dev, "Mem resource not found\n");
+		return -ENODEV;
+	}
+
+	ret = of_property_read_u32(node, "marvell,tad-page-size",
+				   &tad_page_size);
+	if (ret) {
+		dev_err(&pdev->dev, "Can't find tad-page-size property\n");
+		return ret;
+	}
+
+	ret = of_property_read_u32(node, "marvell,tad-pmu-page-size",
+				   &tad_pmu_page_size);
+	if (ret) {
+		dev_err(&pdev->dev, "Can't find tad-pmu-page-size property\n");
+		return ret;
+	}
+
+	ret = of_property_read_u32(node, "marvell,tad-cnt", &tad_cnt);
+	if (ret) {
+		dev_err(&pdev->dev, "Can't find tad-cnt property\n");
+		return ret;
+	}
+
+	regions = devm_kcalloc(&pdev->dev, tad_cnt,
+			       sizeof(*regions), GFP_KERNEL);
+	if (!regions)
+		return -ENOMEM;
+
+	/* ioremap the distributed TAD pmu regions */
+	for (i = 0; i < tad_cnt && res->start < res->end; i++) {
+		regions[i].base = devm_ioremap(&pdev->dev,
+					       res->start,
+					       tad_pmu_page_size);
+		if (!regions[i].base) {
+			dev_err(&pdev->dev, "TAD%d ioremap fail\n", i);
+			return -ENOMEM;
+		}
+		res->start += tad_page_size;
+	}
+
+	tad_pmu->regions = regions;
+	tad_pmu->region_cnt = tad_cnt;
+
+	tad_pmu->pmu = (struct pmu) {
+
+		.module		= THIS_MODULE,
+		.attr_groups	= tad_pmu_attr_groups,
+		.capabilities	= PERF_PMU_CAP_NO_EXCLUDE |
+				  PERF_PMU_CAP_NO_INTERRUPT,
+		.task_ctx_nr	= perf_invalid_context,
+
+		.event_init	= tad_pmu_event_init,
+		.add		= tad_pmu_event_counter_add,
+		.del		= tad_pmu_event_counter_del,
+		.start		= tad_pmu_event_counter_start,
+		.stop		= tad_pmu_event_counter_stop,
+		.read		= tad_pmu_event_counter_read,
+	};
+
+	tad_pmu->cpu = raw_smp_processor_id();
+
+	/* Register pmu instance for cpu hotplug */
+	ret = cpuhp_state_add_instance_nocalls(tad_pmu_cpuhp_state,
+					       &tad_pmu->node);
+	if (ret) {
+		dev_err(&pdev->dev, "Error %d registering hotplug\n", ret);
+		return ret;
+	}
+
+	name = "tad";
+	ret = perf_pmu_register(&tad_pmu->pmu, name, -1);
+	if (ret)
+		cpuhp_state_remove_instance_nocalls(tad_pmu_cpuhp_state,
+						    &tad_pmu->node);
+
+	return ret;
+}
+
+static int tad_pmu_remove(struct platform_device *pdev)
+{
+	struct tad_pmu *pmu = platform_get_drvdata(pdev);
+
+	cpuhp_state_remove_instance_nocalls(tad_pmu_cpuhp_state,
+						&pmu->node);
+	perf_pmu_unregister(&pmu->pmu);
+
+	return 0;
+}
+
+static const struct of_device_id tad_pmu_of_match[] = {
+	{ .compatible = "marvell,cn10k-tad-pmu", },
+	{},
+};
+
+static struct platform_driver tad_pmu_driver = {
+	.driver         = {
+		.name   = "cn10k_tad_pmu",
+		.of_match_table = of_match_ptr(tad_pmu_of_match),
+		.suppress_bind_attrs = true,
+	},
+	.probe          = tad_pmu_probe,
+	.remove         = tad_pmu_remove,
+};
+
+static int tad_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
+{
+	struct tad_pmu *pmu = hlist_entry_safe(node, struct tad_pmu, node);
+	unsigned int target;
+
+	if (cpu != pmu->cpu)
+		return 0;
+
+	target = cpumask_any_but(cpu_online_mask, cpu);
+	if (target >= nr_cpu_ids)
+		return 0;
+
+	perf_pmu_migrate_context(&pmu->pmu, cpu, target);
+	pmu->cpu = target;
+
+	return 0;
+}
+
+static int __init tad_pmu_init(void)
+{
+	int ret;
+
+	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
+				      "perf/cn10k/tadpmu:online",
+				      NULL,
+				      tad_pmu_offline_cpu);
+	if (ret < 0)
+		return ret;
+	tad_pmu_cpuhp_state = ret;
+	return platform_driver_register(&tad_pmu_driver);
+}
+
+static void __exit tad_pmu_exit(void)
+{
+	platform_driver_unregister(&tad_pmu_driver);
+	cpuhp_remove_multi_state(tad_pmu_cpuhp_state);
+}
+
+module_init(tad_pmu_init);
+module_exit(tad_pmu_exit);
+
+MODULE_DESCRIPTION("Marvell CN10K LLC-TAD Perf driver");
+MODULE_AUTHOR("Bhaskara Budiredla <bbudiredla@marvell.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h
index 640f09479bdf..4c2c1b830344 100644
--- a/include/asm-generic/barrier.h
+++ b/include/asm-generic/barrier.h
@@ -251,5 +251,16 @@ do {									\
 #define pmem_wmb()	wmb()
 #endif
 
+/*
+ * ioremap_wc() maps I/O memory as memory with write-combining attributes. For
+ * this kind of memory accesses, the CPU may wait for prior accesses to be
+ * merged with subsequent ones. In some situation, such wait is bad for the
+ * performance. io_stop_wc() can be used to prevent the merging of
+ * write-combining memory accesses before this macro with those after it.
+ */
+#ifndef io_stop_wc
+#define io_stop_wc do { } while (0)
+#endif
+
 #endif /* !__ASSEMBLY__ */
 #endif /* __ASM_GENERIC_BARRIER_H */
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 773c83730906..411a428ace4d 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -225,6 +225,7 @@ enum cpuhp_state {
 	CPUHP_AP_PERF_ARM_HISI_L3_ONLINE,
 	CPUHP_AP_PERF_ARM_HISI_PA_ONLINE,
 	CPUHP_AP_PERF_ARM_HISI_SLLC_ONLINE,
+	CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE,
 	CPUHP_AP_PERF_ARM_L2X0_ONLINE,
 	CPUHP_AP_PERF_ARM_QCOM_L2_ONLINE,
 	CPUHP_AP_PERF_ARM_QCOM_L3_ONLINE,
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 0dcfd265beed..411e34210fbf 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -130,6 +130,15 @@ struct hw_perf_event_extra {
 };
 
 /**
+ * hw_perf_event::flag values
+ *
+ * PERF_EVENT_FLAG_ARCH bits are reserved for architecture-specific
+ * usage.
+ */
+#define PERF_EVENT_FLAG_ARCH			0x0000ffff
+#define PERF_EVENT_FLAG_USER_READ_CNT		0x80000000
+
+/**
  * struct hw_perf_event - performance event hardware details:
  */
 struct hw_perf_event {
@@ -822,6 +831,7 @@ struct perf_event_context {
 
 	int				nr_events;
 	int				nr_active;
+	int				nr_user;
 	int				is_active;
 	int				nr_stat;
 	int				nr_freq;
diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h
index bef158815e83..97455880ac41 100644
--- a/include/linux/stacktrace.h
+++ b/include/linux/stacktrace.h
@@ -8,22 +8,6 @@
 struct task_struct;
 struct pt_regs;
 
-#ifdef CONFIG_STACKTRACE
-void stack_trace_print(const unsigned long *trace, unsigned int nr_entries,
-		       int spaces);
-int stack_trace_snprint(char *buf, size_t size, const unsigned long *entries,
-			unsigned int nr_entries, int spaces);
-unsigned int stack_trace_save(unsigned long *store, unsigned int size,
-			      unsigned int skipnr);
-unsigned int stack_trace_save_tsk(struct task_struct *task,
-				  unsigned long *store, unsigned int size,
-				  unsigned int skipnr);
-unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store,
-				   unsigned int size, unsigned int skipnr);
-unsigned int stack_trace_save_user(unsigned long *store, unsigned int size);
-unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries);
-
-/* Internal interfaces. Do not use in generic code */
 #ifdef CONFIG_ARCH_STACKWALK
 
 /**
@@ -76,8 +60,25 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, void *cookie,
 
 void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie,
 			  const struct pt_regs *regs);
+#endif /* CONFIG_ARCH_STACKWALK */
 
-#else /* CONFIG_ARCH_STACKWALK */
+#ifdef CONFIG_STACKTRACE
+void stack_trace_print(const unsigned long *trace, unsigned int nr_entries,
+		       int spaces);
+int stack_trace_snprint(char *buf, size_t size, const unsigned long *entries,
+			unsigned int nr_entries, int spaces);
+unsigned int stack_trace_save(unsigned long *store, unsigned int size,
+			      unsigned int skipnr);
+unsigned int stack_trace_save_tsk(struct task_struct *task,
+				  unsigned long *store, unsigned int size,
+				  unsigned int skipnr);
+unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store,
+				   unsigned int size, unsigned int skipnr);
+unsigned int stack_trace_save_user(unsigned long *store, unsigned int size);
+unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries);
+
+#ifndef CONFIG_ARCH_STACKWALK
+/* Internal interfaces. Do not use in generic code */
 struct stack_trace {
 	unsigned int nr_entries, max_entries;
 	unsigned long *entries;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 30d94f68c5bd..1362b9b770d8 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1808,6 +1808,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 
 	list_add_rcu(&event->event_entry, &ctx->event_list);
 	ctx->nr_events++;
+	if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
+		ctx->nr_user++;
 	if (event->attr.inherit_stat)
 		ctx->nr_stat++;
 
@@ -1999,6 +2001,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 	event->attach_state &= ~PERF_ATTACH_CONTEXT;
 
 	ctx->nr_events--;
+	if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
+		ctx->nr_user--;
 	if (event->attr.inherit_stat)
 		ctx->nr_stat--;
 
diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile
index c2bb07f5bcc7..e893b0e1d62a 100644
--- a/kernel/kcsan/Makefile
+++ b/kernel/kcsan/Makefile
@@ -8,6 +8,7 @@ CFLAGS_REMOVE_debugfs.o = $(CC_FLAGS_FTRACE)
 CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE)
 
 CFLAGS_core.o := $(call cc-option,-fno-conserve-stack) \
+	$(call cc-option,-mno-outline-atomics) \
 	-fno-stack-protector -DDISABLE_BRANCH_PROFILING
 
 obj-y := core.o debugfs.o report.o
diff --git a/tools/testing/selftests/arm64/Makefile b/tools/testing/selftests/arm64/Makefile
index ced910fb4019..1e8d9a8f59df 100644
--- a/tools/testing/selftests/arm64/Makefile
+++ b/tools/testing/selftests/arm64/Makefile
@@ -4,7 +4,7 @@
 ARCH ?= $(shell uname -m 2>/dev/null || echo not)
 
 ifneq (,$(filter $(ARCH),aarch64 arm64))
-ARM64_SUBTARGETS ?= tags signal pauth fp mte bti
+ARM64_SUBTARGETS ?= tags signal pauth fp mte bti abi
 else
 ARM64_SUBTARGETS :=
 endif
diff --git a/tools/testing/selftests/arm64/abi/.gitignore b/tools/testing/selftests/arm64/abi/.gitignore
new file mode 100644
index 000000000000..b79cf5814c23
--- /dev/null
+++ b/tools/testing/selftests/arm64/abi/.gitignore
@@ -0,0 +1 @@
+syscall-abi
diff --git a/tools/testing/selftests/arm64/abi/Makefile b/tools/testing/selftests/arm64/abi/Makefile
new file mode 100644
index 000000000000..96eba974ac8d
--- /dev/null
+++ b/tools/testing/selftests/arm64/abi/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2021 ARM Limited
+
+TEST_GEN_PROGS := syscall-abi
+
+include ../../lib.mk
+
+$(OUTPUT)/syscall-abi: syscall-abi.c syscall-abi-asm.S
diff --git a/tools/testing/selftests/arm64/abi/syscall-abi-asm.S b/tools/testing/selftests/arm64/abi/syscall-abi-asm.S
new file mode 100644
index 000000000000..983467cfcee0
--- /dev/null
+++ b/tools/testing/selftests/arm64/abi/syscall-abi-asm.S
@@ -0,0 +1,240 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2021 ARM Limited.
+//
+// Assembly portion of the syscall ABI test
+
+//
+// Load values from memory into registers, invoke a syscall and save the
+// register values back to memory for later checking.  The syscall to be
+// invoked is configured in x8 of the input GPR data.
+//
+// x0:	SVE VL, 0 for FP only
+//
+//	GPRs:	gpr_in, gpr_out
+//	FPRs:	fpr_in, fpr_out
+//	Zn:	z_in, z_out
+//	Pn:	p_in, p_out
+//	FFR:	ffr_in, ffr_out
+
+.arch_extension sve
+
+.globl do_syscall
+do_syscall:
+	// Store callee saved registers x19-x29 (80 bytes) plus x0 and x1
+	stp	x29, x30, [sp, #-112]!
+	mov	x29, sp
+	stp	x0, x1, [sp, #16]
+	stp	x19, x20, [sp, #32]
+	stp	x21, x22, [sp, #48]
+	stp	x23, x24, [sp, #64]
+	stp	x25, x26, [sp, #80]
+	stp	x27, x28, [sp, #96]
+
+	// Load GPRs x8-x28, and save our SP/FP for later comparison
+	ldr	x2, =gpr_in
+	add	x2, x2, #64
+	ldp	x8, x9, [x2], #16
+	ldp	x10, x11, [x2], #16
+	ldp	x12, x13, [x2], #16
+	ldp	x14, x15, [x2], #16
+	ldp	x16, x17, [x2], #16
+	ldp	x18, x19, [x2], #16
+	ldp	x20, x21, [x2], #16
+	ldp	x22, x23, [x2], #16
+	ldp	x24, x25, [x2], #16
+	ldp	x26, x27, [x2], #16
+	ldr	x28, [x2], #8
+	str	x29, [x2], #8		// FP
+	str	x30, [x2], #8		// LR
+
+	// Load FPRs if we're not doing SVE
+	cbnz	x0, 1f
+	ldr	x2, =fpr_in
+	ldp	q0, q1, [x2]
+	ldp	q2, q3, [x2, #16 * 2]
+	ldp	q4, q5, [x2, #16 * 4]
+	ldp	q6, q7, [x2, #16 * 6]
+	ldp	q8, q9, [x2, #16 * 8]
+	ldp	q10, q11, [x2, #16 * 10]
+	ldp	q12, q13, [x2, #16 * 12]
+	ldp	q14, q15, [x2, #16 * 14]
+	ldp	q16, q17, [x2, #16 * 16]
+	ldp	q18, q19, [x2, #16 * 18]
+	ldp	q20, q21, [x2, #16 * 20]
+	ldp	q22, q23, [x2, #16 * 22]
+	ldp	q24, q25, [x2, #16 * 24]
+	ldp	q26, q27, [x2, #16 * 26]
+	ldp	q28, q29, [x2, #16 * 28]
+	ldp	q30, q31, [x2, #16 * 30]
+1:
+
+	// Load the SVE registers if we're doing SVE
+	cbz	x0, 1f
+
+	ldr	x2, =z_in
+	ldr	z0, [x2, #0, MUL VL]
+	ldr	z1, [x2, #1, MUL VL]
+	ldr	z2, [x2, #2, MUL VL]
+	ldr	z3, [x2, #3, MUL VL]
+	ldr	z4, [x2, #4, MUL VL]
+	ldr	z5, [x2, #5, MUL VL]
+	ldr	z6, [x2, #6, MUL VL]
+	ldr	z7, [x2, #7, MUL VL]
+	ldr	z8, [x2, #8, MUL VL]
+	ldr	z9, [x2, #9, MUL VL]
+	ldr	z10, [x2, #10, MUL VL]
+	ldr	z11, [x2, #11, MUL VL]
+	ldr	z12, [x2, #12, MUL VL]
+	ldr	z13, [x2, #13, MUL VL]
+	ldr	z14, [x2, #14, MUL VL]
+	ldr	z15, [x2, #15, MUL VL]
+	ldr	z16, [x2, #16, MUL VL]
+	ldr	z17, [x2, #17, MUL VL]
+	ldr	z18, [x2, #18, MUL VL]
+	ldr	z19, [x2, #19, MUL VL]
+	ldr	z20, [x2, #20, MUL VL]
+	ldr	z21, [x2, #21, MUL VL]
+	ldr	z22, [x2, #22, MUL VL]
+	ldr	z23, [x2, #23, MUL VL]
+	ldr	z24, [x2, #24, MUL VL]
+	ldr	z25, [x2, #25, MUL VL]
+	ldr	z26, [x2, #26, MUL VL]
+	ldr	z27, [x2, #27, MUL VL]
+	ldr	z28, [x2, #28, MUL VL]
+	ldr	z29, [x2, #29, MUL VL]
+	ldr	z30, [x2, #30, MUL VL]
+	ldr	z31, [x2, #31, MUL VL]
+
+	ldr	x2, =ffr_in
+	ldr	p0, [x2, #0]
+	wrffr	p0.b
+
+	ldr	x2, =p_in
+	ldr	p0, [x2, #0, MUL VL]
+	ldr	p1, [x2, #1, MUL VL]
+	ldr	p2, [x2, #2, MUL VL]
+	ldr	p3, [x2, #3, MUL VL]
+	ldr	p4, [x2, #4, MUL VL]
+	ldr	p5, [x2, #5, MUL VL]
+	ldr	p6, [x2, #6, MUL VL]
+	ldr	p7, [x2, #7, MUL VL]
+	ldr	p8, [x2, #8, MUL VL]
+	ldr	p9, [x2, #9, MUL VL]
+	ldr	p10, [x2, #10, MUL VL]
+	ldr	p11, [x2, #11, MUL VL]
+	ldr	p12, [x2, #12, MUL VL]
+	ldr	p13, [x2, #13, MUL VL]
+	ldr	p14, [x2, #14, MUL VL]
+	ldr	p15, [x2, #15, MUL VL]
+1:
+
+	// Do the syscall
+	svc	#0
+
+	// Save GPRs x8-x30
+	ldr	x2, =gpr_out
+	add	x2, x2, #64
+	stp	x8, x9, [x2], #16
+	stp	x10, x11, [x2], #16
+	stp	x12, x13, [x2], #16
+	stp	x14, x15, [x2], #16
+	stp	x16, x17, [x2], #16
+	stp	x18, x19, [x2], #16
+	stp	x20, x21, [x2], #16
+	stp	x22, x23, [x2], #16
+	stp	x24, x25, [x2], #16
+	stp	x26, x27, [x2], #16
+	stp	x28, x29, [x2], #16
+	str	x30, [x2]
+
+	// Restore x0 and x1 for feature checks
+	ldp	x0, x1, [sp, #16]
+
+	// Save FPSIMD state
+	ldr	x2, =fpr_out
+	stp	q0, q1, [x2]
+	stp	q2, q3, [x2, #16 * 2]
+	stp	q4, q5, [x2, #16 * 4]
+	stp	q6, q7, [x2, #16 * 6]
+	stp	q8, q9, [x2, #16 * 8]
+	stp	q10, q11, [x2, #16 * 10]
+	stp	q12, q13, [x2, #16 * 12]
+	stp	q14, q15, [x2, #16 * 14]
+	stp	q16, q17, [x2, #16 * 16]
+	stp	q18, q19, [x2, #16 * 18]
+	stp	q20, q21, [x2, #16 * 20]
+	stp	q22, q23, [x2, #16 * 22]
+	stp	q24, q25, [x2, #16 * 24]
+	stp	q26, q27, [x2, #16 * 26]
+	stp	q28, q29, [x2, #16 * 28]
+	stp	q30, q31, [x2, #16 * 30]
+
+	// Save the SVE state if we have some
+	cbz	x0, 1f
+
+	ldr	x2, =z_out
+	str	z0, [x2, #0, MUL VL]
+	str	z1, [x2, #1, MUL VL]
+	str	z2, [x2, #2, MUL VL]
+	str	z3, [x2, #3, MUL VL]
+	str	z4, [x2, #4, MUL VL]
+	str	z5, [x2, #5, MUL VL]
+	str	z6, [x2, #6, MUL VL]
+	str	z7, [x2, #7, MUL VL]
+	str	z8, [x2, #8, MUL VL]
+	str	z9, [x2, #9, MUL VL]
+	str	z10, [x2, #10, MUL VL]
+	str	z11, [x2, #11, MUL VL]
+	str	z12, [x2, #12, MUL VL]
+	str	z13, [x2, #13, MUL VL]
+	str	z14, [x2, #14, MUL VL]
+	str	z15, [x2, #15, MUL VL]
+	str	z16, [x2, #16, MUL VL]
+	str	z17, [x2, #17, MUL VL]
+	str	z18, [x2, #18, MUL VL]
+	str	z19, [x2, #19, MUL VL]
+	str	z20, [x2, #20, MUL VL]
+	str	z21, [x2, #21, MUL VL]
+	str	z22, [x2, #22, MUL VL]
+	str	z23, [x2, #23, MUL VL]
+	str	z24, [x2, #24, MUL VL]
+	str	z25, [x2, #25, MUL VL]
+	str	z26, [x2, #26, MUL VL]
+	str	z27, [x2, #27, MUL VL]
+	str	z28, [x2, #28, MUL VL]
+	str	z29, [x2, #29, MUL VL]
+	str	z30, [x2, #30, MUL VL]
+	str	z31, [x2, #31, MUL VL]
+
+	ldr	x2, =p_out
+	str	p0, [x2, #0, MUL VL]
+	str	p1, [x2, #1, MUL VL]
+	str	p2, [x2, #2, MUL VL]
+	str	p3, [x2, #3, MUL VL]
+	str	p4, [x2, #4, MUL VL]
+	str	p5, [x2, #5, MUL VL]
+	str	p6, [x2, #6, MUL VL]
+	str	p7, [x2, #7, MUL VL]
+	str	p8, [x2, #8, MUL VL]
+	str	p9, [x2, #9, MUL VL]
+	str	p10, [x2, #10, MUL VL]
+	str	p11, [x2, #11, MUL VL]
+	str	p12, [x2, #12, MUL VL]
+	str	p13, [x2, #13, MUL VL]
+	str	p14, [x2, #14, MUL VL]
+	str	p15, [x2, #15, MUL VL]
+
+	ldr	x2, =ffr_out
+	rdffr	p0.b
+	str	p0, [x2, #0]
+1:
+
+	// Restore callee saved registers x19-x30
+	ldp	x19, x20, [sp, #32]
+	ldp	x21, x22, [sp, #48]
+	ldp	x23, x24, [sp, #64]
+	ldp	x25, x26, [sp, #80]
+	ldp	x27, x28, [sp, #96]
+	ldp	x29, x30, [sp], #112
+
+	ret
diff --git a/tools/testing/selftests/arm64/abi/syscall-abi.c b/tools/testing/selftests/arm64/abi/syscall-abi.c
new file mode 100644
index 000000000000..d8eeeafb50dc
--- /dev/null
+++ b/tools/testing/selftests/arm64/abi/syscall-abi.c
@@ -0,0 +1,318 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2021 ARM Limited.
+ */
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/auxv.h>
+#include <sys/prctl.h>
+#include <asm/hwcap.h>
+#include <asm/sigcontext.h>
+#include <asm/unistd.h>
+
+#include "../../kselftest.h"
+
+#define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0]))
+#define NUM_VL ((SVE_VQ_MAX - SVE_VQ_MIN) + 1)
+
+extern void do_syscall(int sve_vl);
+
+static void fill_random(void *buf, size_t size)
+{
+	int i;
+	uint32_t *lbuf = buf;
+
+	/* random() returns a 32 bit number regardless of the size of long */
+	for (i = 0; i < size / sizeof(uint32_t); i++)
+		lbuf[i] = random();
+}
+
+/*
+ * We also repeat the test for several syscalls to try to expose different
+ * behaviour.
+ */
+static struct syscall_cfg {
+	int syscall_nr;
+	const char *name;
+} syscalls[] = {
+	{ __NR_getpid,		"getpid()" },
+	{ __NR_sched_yield,	"sched_yield()" },
+};
+
+#define NUM_GPR 31
+uint64_t gpr_in[NUM_GPR];
+uint64_t gpr_out[NUM_GPR];
+
+static void setup_gpr(struct syscall_cfg *cfg, int sve_vl)
+{
+	fill_random(gpr_in, sizeof(gpr_in));
+	gpr_in[8] = cfg->syscall_nr;
+	memset(gpr_out, 0, sizeof(gpr_out));
+}
+
+static int check_gpr(struct syscall_cfg *cfg, int sve_vl)
+{
+	int errors = 0;
+	int i;
+
+	/*
+	 * GPR x0-x7 may be clobbered, and all others should be preserved.
+	 */
+	for (i = 9; i < ARRAY_SIZE(gpr_in); i++) {
+		if (gpr_in[i] != gpr_out[i]) {
+			ksft_print_msg("%s SVE VL %d mismatch in GPR %d: %llx != %llx\n",
+				       cfg->name, sve_vl, i,
+				       gpr_in[i], gpr_out[i]);
+			errors++;
+		}
+	}
+
+	return errors;
+}
+
+#define NUM_FPR 32
+uint64_t fpr_in[NUM_FPR * 2];
+uint64_t fpr_out[NUM_FPR * 2];
+
+static void setup_fpr(struct syscall_cfg *cfg, int sve_vl)
+{
+	fill_random(fpr_in, sizeof(fpr_in));
+	memset(fpr_out, 0, sizeof(fpr_out));
+}
+
+static int check_fpr(struct syscall_cfg *cfg, int sve_vl)
+{
+	int errors = 0;
+	int i;
+
+	if (!sve_vl) {
+		for (i = 0; i < ARRAY_SIZE(fpr_in); i++) {
+			if (fpr_in[i] != fpr_out[i]) {
+				ksft_print_msg("%s Q%d/%d mismatch %llx != %llx\n",
+					       cfg->name,
+					       i / 2, i % 2,
+					       fpr_in[i], fpr_out[i]);
+				errors++;
+			}
+		}
+	}
+
+	return errors;
+}
+
+static uint8_t z_zero[__SVE_ZREG_SIZE(SVE_VQ_MAX)];
+uint8_t z_in[SVE_NUM_PREGS * __SVE_ZREG_SIZE(SVE_VQ_MAX)];
+uint8_t z_out[SVE_NUM_PREGS * __SVE_ZREG_SIZE(SVE_VQ_MAX)];
+
+static void setup_z(struct syscall_cfg *cfg, int sve_vl)
+{
+	fill_random(z_in, sizeof(z_in));
+	fill_random(z_out, sizeof(z_out));
+}
+
+static int check_z(struct syscall_cfg *cfg, int sve_vl)
+{
+	size_t reg_size = sve_vl;
+	int errors = 0;
+	int i;
+
+	if (!sve_vl)
+		return 0;
+
+	/*
+	 * After a syscall the low 128 bits of the Z registers should
+	 * be preserved and the rest be zeroed or preserved.
+	 */
+	for (i = 0; i < SVE_NUM_ZREGS; i++) {
+		void *in = &z_in[reg_size * i];
+		void *out = &z_out[reg_size * i];
+
+		if (memcmp(in, out, SVE_VQ_BYTES) != 0) {
+			ksft_print_msg("%s SVE VL %d Z%d low 128 bits changed\n",
+				       cfg->name, sve_vl, i);
+			errors++;
+		}
+	}
+
+	return errors;
+}
+
+uint8_t p_in[SVE_NUM_PREGS * __SVE_PREG_SIZE(SVE_VQ_MAX)];
+uint8_t p_out[SVE_NUM_PREGS * __SVE_PREG_SIZE(SVE_VQ_MAX)];
+
+static void setup_p(struct syscall_cfg *cfg, int sve_vl)
+{
+	fill_random(p_in, sizeof(p_in));
+	fill_random(p_out, sizeof(p_out));
+}
+
+static int check_p(struct syscall_cfg *cfg, int sve_vl)
+{
+	size_t reg_size = sve_vq_from_vl(sve_vl) * 2; /* 1 bit per VL byte */
+
+	int errors = 0;
+	int i;
+
+	if (!sve_vl)
+		return 0;
+
+	/* After a syscall the P registers should be preserved or zeroed */
+	for (i = 0; i < SVE_NUM_PREGS * reg_size; i++)
+		if (p_out[i] && (p_in[i] != p_out[i]))
+			errors++;
+	if (errors)
+		ksft_print_msg("%s SVE VL %d predicate registers non-zero\n",
+			       cfg->name, sve_vl);
+
+	return errors;
+}
+
+uint8_t ffr_in[__SVE_PREG_SIZE(SVE_VQ_MAX)];
+uint8_t ffr_out[__SVE_PREG_SIZE(SVE_VQ_MAX)];
+
+static void setup_ffr(struct syscall_cfg *cfg, int sve_vl)
+{
+	/*
+	 * It is only valid to set a contiguous set of bits starting
+	 * at 0.  For now since we're expecting this to be cleared by
+	 * a syscall just set all bits.
+	 */
+	memset(ffr_in, 0xff, sizeof(ffr_in));
+	fill_random(ffr_out, sizeof(ffr_out));
+}
+
+static int check_ffr(struct syscall_cfg *cfg, int sve_vl)
+{
+	size_t reg_size = sve_vq_from_vl(sve_vl) * 2;  /* 1 bit per VL byte */
+	int errors = 0;
+	int i;
+
+	if (!sve_vl)
+		return 0;
+
+	/* After a syscall the P registers should be preserved or zeroed */
+	for (i = 0; i < reg_size; i++)
+		if (ffr_out[i] && (ffr_in[i] != ffr_out[i]))
+			errors++;
+	if (errors)
+		ksft_print_msg("%s SVE VL %d FFR non-zero\n",
+			       cfg->name, sve_vl);
+
+	return errors;
+}
+
+typedef void (*setup_fn)(struct syscall_cfg *cfg, int sve_vl);
+typedef int (*check_fn)(struct syscall_cfg *cfg, int sve_vl);
+
+/*
+ * Each set of registers has a setup function which is called before
+ * the syscall to fill values in a global variable for loading by the
+ * test code and a check function which validates that the results are
+ * as expected.  Vector lengths are passed everywhere, a vector length
+ * of 0 should be treated as do not test.
+ */
+static struct {
+	setup_fn setup;
+	check_fn check;
+} regset[] = {
+	{ setup_gpr, check_gpr },
+	{ setup_fpr, check_fpr },
+	{ setup_z, check_z },
+	{ setup_p, check_p },
+	{ setup_ffr, check_ffr },
+};
+
+static bool do_test(struct syscall_cfg *cfg, int sve_vl)
+{
+	int errors = 0;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(regset); i++)
+		regset[i].setup(cfg, sve_vl);
+
+	do_syscall(sve_vl);
+
+	for (i = 0; i < ARRAY_SIZE(regset); i++)
+		errors += regset[i].check(cfg, sve_vl);
+
+	return errors == 0;
+}
+
+static void test_one_syscall(struct syscall_cfg *cfg)
+{
+	int sve_vq, sve_vl;
+
+	/* FPSIMD only case */
+	ksft_test_result(do_test(cfg, 0),
+			 "%s FPSIMD\n", cfg->name);
+
+	if (!(getauxval(AT_HWCAP) & HWCAP_SVE))
+		return;
+
+	for (sve_vq = SVE_VQ_MAX; sve_vq > 0; --sve_vq) {
+		sve_vl = prctl(PR_SVE_SET_VL, sve_vq * 16);
+		if (sve_vl == -1)
+			ksft_exit_fail_msg("PR_SVE_SET_VL failed: %s (%d)\n",
+					   strerror(errno), errno);
+
+		sve_vl &= PR_SVE_VL_LEN_MASK;
+
+		if (sve_vq != sve_vq_from_vl(sve_vl))
+			sve_vq = sve_vq_from_vl(sve_vl);
+
+		ksft_test_result(do_test(cfg, sve_vl),
+				 "%s SVE VL %d\n", cfg->name, sve_vl);
+	}
+}
+
+int sve_count_vls(void)
+{
+	unsigned int vq;
+	int vl_count = 0;
+	int vl;
+
+	if (!(getauxval(AT_HWCAP) & HWCAP_SVE))
+		return 0;
+
+	/*
+	 * Enumerate up to SVE_VQ_MAX vector lengths
+	 */
+	for (vq = SVE_VQ_MAX; vq > 0; --vq) {
+		vl = prctl(PR_SVE_SET_VL, vq * 16);
+		if (vl == -1)
+			ksft_exit_fail_msg("PR_SVE_SET_VL failed: %s (%d)\n",
+					   strerror(errno), errno);
+
+		vl &= PR_SVE_VL_LEN_MASK;
+
+		if (vq != sve_vq_from_vl(vl))
+			vq = sve_vq_from_vl(vl);
+
+		vl_count++;
+	}
+
+	return vl_count;
+}
+
+int main(void)
+{
+	int i;
+
+	srandom(getpid());
+
+	ksft_print_header();
+	ksft_set_plan(ARRAY_SIZE(syscalls) * (sve_count_vls() + 1));
+
+	for (i = 0; i < ARRAY_SIZE(syscalls); i++)
+		test_one_syscall(&syscalls[i]);
+
+	ksft_print_cnts();
+
+	return 0;
+}
diff --git a/tools/testing/selftests/arm64/fp/.gitignore b/tools/testing/selftests/arm64/fp/.gitignore
index b67395903b9b..c50d86331ed2 100644
--- a/tools/testing/selftests/arm64/fp/.gitignore
+++ b/tools/testing/selftests/arm64/fp/.gitignore
@@ -1,3 +1,4 @@
+fp-pidbench
 fpsimd-test
 rdvl-sve
 sve-probe-vls
diff --git a/tools/testing/selftests/arm64/fp/Makefile b/tools/testing/selftests/arm64/fp/Makefile
index ba1488c7c315..95f0b877a060 100644
--- a/tools/testing/selftests/arm64/fp/Makefile
+++ b/tools/testing/selftests/arm64/fp/Makefile
@@ -2,13 +2,15 @@
 
 CFLAGS += -I../../../../../usr/include/
 TEST_GEN_PROGS := sve-ptrace sve-probe-vls vec-syscfg
-TEST_PROGS_EXTENDED := fpsimd-test fpsimd-stress \
+TEST_PROGS_EXTENDED := fp-pidbench fpsimd-test fpsimd-stress \
 	rdvl-sve \
 	sve-test sve-stress \
 	vlset
 
 all: $(TEST_GEN_PROGS) $(TEST_PROGS_EXTENDED)
 
+fp-pidbench: fp-pidbench.S asm-utils.o
+	$(CC) -nostdlib $^ -o $@
 fpsimd-test: fpsimd-test.o asm-utils.o
 	$(CC) -nostdlib $^ -o $@
 rdvl-sve: rdvl-sve.o rdvl.o
diff --git a/tools/testing/selftests/arm64/fp/fp-pidbench.S b/tools/testing/selftests/arm64/fp/fp-pidbench.S
new file mode 100644
index 000000000000..16a436389bfc
--- /dev/null
+++ b/tools/testing/selftests/arm64/fp/fp-pidbench.S
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2021 ARM Limited.
+// Original author: Mark Brown <broonie@kernel.org>
+//
+// Trivial syscall overhead benchmark.
+//
+// This is implemented in asm to ensure that we don't have any issues with
+// system libraries using instructions that disrupt the test.
+
+#include <asm/unistd.h>
+#include "assembler.h"
+
+.arch_extension sve
+
+.macro test_loop per_loop
+	mov	x10, x20
+	mov	x8, #__NR_getpid
+	mrs	x11, CNTVCT_EL0
+1:
+	\per_loop
+	svc	#0
+	sub	x10, x10, #1
+	cbnz	x10, 1b
+
+	mrs	x12, CNTVCT_EL0
+	sub	x0, x12, x11
+	bl	putdec
+	puts	"\n"
+.endm
+
+// Main program entry point
+.globl _start
+function _start
+_start:
+	puts	"Iterations per test: "
+	mov	x20, #10000
+	lsl	x20, x20, #8
+	mov	x0, x20
+	bl	putdec
+	puts	"\n"
+
+	// Test having never used SVE
+	puts	"No SVE: "
+	test_loop
+
+	// Check for SVE support - should use hwcap but that's hard in asm
+	mrs	x0, ID_AA64PFR0_EL1
+	ubfx	x0, x0, #32, #4
+	cbnz	x0, 1f
+	puts	"System does not support SVE\n"
+	b	out
+1:
+
+	// Execute a SVE instruction
+	puts	"SVE VL: "
+	rdvl	x0, #8
+	bl	putdec
+	puts	"\n"
+
+	puts	"SVE used once: "
+	test_loop
+
+	// Use SVE per syscall
+	puts	"SVE used per syscall: "
+	test_loop "rdvl x0, #8"
+
+	//  And we're done
+out:
+	mov	x0, #0
+	mov	x8, #__NR_exit
+	svc	#0
diff --git a/tools/testing/selftests/arm64/fp/sve-ptrace.c b/tools/testing/selftests/arm64/fp/sve-ptrace.c
index c4417bc48d4f..af798b9d232c 100644
--- a/tools/testing/selftests/arm64/fp/sve-ptrace.c
+++ b/tools/testing/selftests/arm64/fp/sve-ptrace.c
@@ -21,16 +21,37 @@
 
 #include "../../kselftest.h"
 
-#define VL_TESTS (((SVE_VQ_MAX - SVE_VQ_MIN) + 1) * 3)
-#define FPSIMD_TESTS 5
-
-#define EXPECTED_TESTS (VL_TESTS + FPSIMD_TESTS)
+#define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0]))
 
 /* <linux/elf.h> and <sys/auxv.h> don't like each other, so: */
 #ifndef NT_ARM_SVE
 #define NT_ARM_SVE 0x405
 #endif
 
+struct vec_type {
+	const char *name;
+	unsigned long hwcap_type;
+	unsigned long hwcap;
+	int regset;
+	int prctl_set;
+};
+
+static const struct vec_type vec_types[] = {
+	{
+		.name = "SVE",
+		.hwcap_type = AT_HWCAP,
+		.hwcap = HWCAP_SVE,
+		.regset = NT_ARM_SVE,
+		.prctl_set = PR_SVE_SET_VL,
+	},
+};
+
+#define VL_TESTS (((SVE_VQ_MAX - SVE_VQ_MIN) + 1) * 3)
+#define FLAG_TESTS 2
+#define FPSIMD_TESTS 3
+
+#define EXPECTED_TESTS ((VL_TESTS + FLAG_TESTS + FPSIMD_TESTS) * ARRAY_SIZE(vec_types))
+
 static void fill_buf(char *buf, size_t size)
 {
 	int i;
@@ -59,7 +80,8 @@ static int get_fpsimd(pid_t pid, struct user_fpsimd_state *fpsimd)
 	return ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov);
 }
 
-static struct user_sve_header *get_sve(pid_t pid, void **buf, size_t *size)
+static struct user_sve_header *get_sve(pid_t pid, const struct vec_type *type,
+				       void **buf, size_t *size)
 {
 	struct user_sve_header *sve;
 	void *p;
@@ -80,7 +102,7 @@ static struct user_sve_header *get_sve(pid_t pid, void **buf, size_t *size)
 
 		iov.iov_base = *buf;
 		iov.iov_len = sz;
-		if (ptrace(PTRACE_GETREGSET, pid, NT_ARM_SVE, &iov))
+		if (ptrace(PTRACE_GETREGSET, pid, type->regset, &iov))
 			goto error;
 
 		sve = *buf;
@@ -96,17 +118,18 @@ error:
 	return NULL;
 }
 
-static int set_sve(pid_t pid, const struct user_sve_header *sve)
+static int set_sve(pid_t pid, const struct vec_type *type,
+		   const struct user_sve_header *sve)
 {
 	struct iovec iov;
 
 	iov.iov_base = (void *)sve;
 	iov.iov_len = sve->size;
-	return ptrace(PTRACE_SETREGSET, pid, NT_ARM_SVE, &iov);
+	return ptrace(PTRACE_SETREGSET, pid, type->regset, &iov);
 }
 
 /* Validate setting and getting the inherit flag */
-static void ptrace_set_get_inherit(pid_t child)
+static void ptrace_set_get_inherit(pid_t child, const struct vec_type *type)
 {
 	struct user_sve_header sve;
 	struct user_sve_header *new_sve = NULL;
@@ -118,9 +141,10 @@ static void ptrace_set_get_inherit(pid_t child)
 	sve.size = sizeof(sve);
 	sve.vl = sve_vl_from_vq(SVE_VQ_MIN);
 	sve.flags = SVE_PT_VL_INHERIT;
-	ret = set_sve(child, &sve);
+	ret = set_sve(child, type, &sve);
 	if (ret != 0) {
-		ksft_test_result_fail("Failed to set SVE_PT_VL_INHERIT\n");
+		ksft_test_result_fail("Failed to set %s SVE_PT_VL_INHERIT\n",
+				      type->name);
 		return;
 	}
 
@@ -128,35 +152,39 @@ static void ptrace_set_get_inherit(pid_t child)
 	 * Read back the new register state and verify that we have
 	 * set the flags we expected.
 	 */
-	if (!get_sve(child, (void **)&new_sve, &new_sve_size)) {
-		ksft_test_result_fail("Failed to read SVE flags\n");
+	if (!get_sve(child, type, (void **)&new_sve, &new_sve_size)) {
+		ksft_test_result_fail("Failed to read %s SVE flags\n",
+				      type->name);
 		return;
 	}
 
 	ksft_test_result(new_sve->flags & SVE_PT_VL_INHERIT,
-			 "SVE_PT_VL_INHERIT set\n");
+			 "%s SVE_PT_VL_INHERIT set\n", type->name);
 
 	/* Now clear */
 	sve.flags &= ~SVE_PT_VL_INHERIT;
-	ret = set_sve(child, &sve);
+	ret = set_sve(child, type, &sve);
 	if (ret != 0) {
-		ksft_test_result_fail("Failed to clear SVE_PT_VL_INHERIT\n");
+		ksft_test_result_fail("Failed to clear %s SVE_PT_VL_INHERIT\n",
+				      type->name);
 		return;
 	}
 
-	if (!get_sve(child, (void **)&new_sve, &new_sve_size)) {
-		ksft_test_result_fail("Failed to read SVE flags\n");
+	if (!get_sve(child, type, (void **)&new_sve, &new_sve_size)) {
+		ksft_test_result_fail("Failed to read %s SVE flags\n",
+				      type->name);
 		return;
 	}
 
 	ksft_test_result(!(new_sve->flags & SVE_PT_VL_INHERIT),
-			 "SVE_PT_VL_INHERIT cleared\n");
+			 "%s SVE_PT_VL_INHERIT cleared\n", type->name);
 
 	free(new_sve);
 }
 
 /* Validate attempting to set the specfied VL via ptrace */
-static void ptrace_set_get_vl(pid_t child, unsigned int vl, bool *supported)
+static void ptrace_set_get_vl(pid_t child, const struct vec_type *type,
+			      unsigned int vl, bool *supported)
 {
 	struct user_sve_header sve;
 	struct user_sve_header *new_sve = NULL;
@@ -166,10 +194,10 @@ static void ptrace_set_get_vl(pid_t child, unsigned int vl, bool *supported)
 	*supported = false;
 
 	/* Check if the VL is supported in this process */
-	prctl_vl = prctl(PR_SVE_SET_VL, vl);
+	prctl_vl = prctl(type->prctl_set, vl);
 	if (prctl_vl == -1)
-		ksft_exit_fail_msg("prctl(PR_SVE_SET_VL) failed: %s (%d)\n",
-				   strerror(errno), errno);
+		ksft_exit_fail_msg("prctl(PR_%s_SET_VL) failed: %s (%d)\n",
+				   type->name, strerror(errno), errno);
 
 	/* If the VL is not supported then a supported VL will be returned */
 	*supported = (prctl_vl == vl);
@@ -178,9 +206,10 @@ static void ptrace_set_get_vl(pid_t child, unsigned int vl, bool *supported)
 	memset(&sve, 0, sizeof(sve));
 	sve.size = sizeof(sve);
 	sve.vl = vl;
-	ret = set_sve(child, &sve);
+	ret = set_sve(child, type, &sve);
 	if (ret != 0) {
-		ksft_test_result_fail("Failed to set VL %u\n", vl);
+		ksft_test_result_fail("Failed to set %s VL %u\n",
+				      type->name, vl);
 		return;
 	}
 
@@ -188,12 +217,14 @@ static void ptrace_set_get_vl(pid_t child, unsigned int vl, bool *supported)
 	 * Read back the new register state and verify that we have the
 	 * same VL that we got from prctl() on ourselves.
 	 */
-	if (!get_sve(child, (void **)&new_sve, &new_sve_size)) {
-		ksft_test_result_fail("Failed to read VL %u\n", vl);
+	if (!get_sve(child, type, (void **)&new_sve, &new_sve_size)) {
+		ksft_test_result_fail("Failed to read %s VL %u\n",
+				      type->name, vl);
 		return;
 	}
 
-	ksft_test_result(new_sve->vl = prctl_vl, "Set VL %u\n", vl);
+	ksft_test_result(new_sve->vl = prctl_vl, "Set %s VL %u\n",
+			 type->name, vl);
 
 	free(new_sve);
 }
@@ -209,7 +240,7 @@ static void check_u32(unsigned int vl, const char *reg,
 }
 
 /* Access the FPSIMD registers via the SVE regset */
-static void ptrace_sve_fpsimd(pid_t child)
+static void ptrace_sve_fpsimd(pid_t child, const struct vec_type *type)
 {
 	void *svebuf = NULL;
 	size_t svebufsz = 0;
@@ -219,17 +250,18 @@ static void ptrace_sve_fpsimd(pid_t child)
 	unsigned char *p;
 
 	/* New process should start with FPSIMD registers only */
-	sve = get_sve(child, &svebuf, &svebufsz);
+	sve = get_sve(child, type, &svebuf, &svebufsz);
 	if (!sve) {
-		ksft_test_result_fail("get_sve: %s\n", strerror(errno));
+		ksft_test_result_fail("get_sve(%s): %s\n",
+				      type->name, strerror(errno));
 
 		return;
 	} else {
-		ksft_test_result_pass("get_sve(FPSIMD)\n");
+		ksft_test_result_pass("get_sve(%s FPSIMD)\n", type->name);
 	}
 
 	ksft_test_result((sve->flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD,
-			 "Set FPSIMD registers\n");
+			 "Set FPSIMD registers via %s\n", type->name);
 	if ((sve->flags & SVE_PT_REGS_MASK) != SVE_PT_REGS_FPSIMD)
 		goto out;
 
@@ -243,9 +275,9 @@ static void ptrace_sve_fpsimd(pid_t child)
 			p[j] = j;
 	}
 
-	if (set_sve(child, sve)) {
-		ksft_test_result_fail("set_sve(FPSIMD): %s\n",
-				      strerror(errno));
+	if (set_sve(child, type, sve)) {
+		ksft_test_result_fail("set_sve(%s FPSIMD): %s\n",
+				      type->name, strerror(errno));
 
 		goto out;
 	}
@@ -257,16 +289,20 @@ static void ptrace_sve_fpsimd(pid_t child)
 		goto out;
 	}
 	if (memcmp(fpsimd, &new_fpsimd, sizeof(*fpsimd)) == 0)
-		ksft_test_result_pass("get_fpsimd() gave same state\n");
+		ksft_test_result_pass("%s get_fpsimd() gave same state\n",
+				      type->name);
 	else
-		ksft_test_result_fail("get_fpsimd() gave different state\n");
+		ksft_test_result_fail("%s get_fpsimd() gave different state\n",
+				      type->name);
 
 out:
 	free(svebuf);
 }
 
 /* Validate attempting to set SVE data and read SVE data */
-static void ptrace_set_sve_get_sve_data(pid_t child, unsigned int vl)
+static void ptrace_set_sve_get_sve_data(pid_t child,
+					const struct vec_type *type,
+					unsigned int vl)
 {
 	void *write_buf;
 	void *read_buf = NULL;
@@ -281,8 +317,8 @@ static void ptrace_set_sve_get_sve_data(pid_t child, unsigned int vl)
 	data_size = SVE_PT_SVE_OFFSET + SVE_PT_SVE_SIZE(vq, SVE_PT_REGS_SVE);
 	write_buf = malloc(data_size);
 	if (!write_buf) {
-		ksft_test_result_fail("Error allocating %d byte buffer for VL %u\n",
-				      data_size, vl);
+		ksft_test_result_fail("Error allocating %d byte buffer for %s VL %u\n",
+				      data_size, type->name, vl);
 		return;
 	}
 	write_sve = write_buf;
@@ -306,23 +342,26 @@ static void ptrace_set_sve_get_sve_data(pid_t child, unsigned int vl)
 
 	/* TODO: Generate a valid FFR pattern */
 
-	ret = set_sve(child, write_sve);
+	ret = set_sve(child, type, write_sve);
 	if (ret != 0) {
-		ksft_test_result_fail("Failed to set VL %u data\n", vl);
+		ksft_test_result_fail("Failed to set %s VL %u data\n",
+				      type->name, vl);
 		goto out;
 	}
 
 	/* Read the data back */
-	if (!get_sve(child, (void **)&read_buf, &read_sve_size)) {
-		ksft_test_result_fail("Failed to read VL %u data\n", vl);
+	if (!get_sve(child, type, (void **)&read_buf, &read_sve_size)) {
+		ksft_test_result_fail("Failed to read %s VL %u data\n",
+				      type->name, vl);
 		goto out;
 	}
 	read_sve = read_buf;
 
 	/* We might read more data if there's extensions we don't know */
 	if (read_sve->size < write_sve->size) {
-		ksft_test_result_fail("Wrote %d bytes, only read %d\n",
-				      write_sve->size, read_sve->size);
+		ksft_test_result_fail("%s wrote %d bytes, only read %d\n",
+				      type->name, write_sve->size,
+				      read_sve->size);
 		goto out_read;
 	}
 
@@ -349,7 +388,8 @@ static void ptrace_set_sve_get_sve_data(pid_t child, unsigned int vl)
 	check_u32(vl, "FPCR", write_buf + SVE_PT_SVE_FPCR_OFFSET(vq),
 		  read_buf + SVE_PT_SVE_FPCR_OFFSET(vq), &errors);
 
-	ksft_test_result(errors == 0, "Set and get SVE data for VL %u\n", vl);
+	ksft_test_result(errors == 0, "Set and get %s data for VL %u\n",
+			 type->name, vl);
 
 out_read:
 	free(read_buf);
@@ -358,7 +398,9 @@ out:
 }
 
 /* Validate attempting to set SVE data and read SVE data */
-static void ptrace_set_sve_get_fpsimd_data(pid_t child, unsigned int vl)
+static void ptrace_set_sve_get_fpsimd_data(pid_t child,
+					   const struct vec_type *type,
+					   unsigned int vl)
 {
 	void *write_buf;
 	struct user_sve_header *write_sve;
@@ -376,8 +418,8 @@ static void ptrace_set_sve_get_fpsimd_data(pid_t child, unsigned int vl)
 	data_size = SVE_PT_SVE_OFFSET + SVE_PT_SVE_SIZE(vq, SVE_PT_REGS_SVE);
 	write_buf = malloc(data_size);
 	if (!write_buf) {
-		ksft_test_result_fail("Error allocating %d byte buffer for VL %u\n",
-				      data_size, vl);
+		ksft_test_result_fail("Error allocating %d byte buffer for %s VL %u\n",
+				      data_size, type->name, vl);
 		return;
 	}
 	write_sve = write_buf;
@@ -395,16 +437,17 @@ static void ptrace_set_sve_get_fpsimd_data(pid_t child, unsigned int vl)
 	fill_buf(write_buf + SVE_PT_SVE_FPSR_OFFSET(vq), SVE_PT_SVE_FPSR_SIZE);
 	fill_buf(write_buf + SVE_PT_SVE_FPCR_OFFSET(vq), SVE_PT_SVE_FPCR_SIZE);
 
-	ret = set_sve(child, write_sve);
+	ret = set_sve(child, type, write_sve);
 	if (ret != 0) {
-		ksft_test_result_fail("Failed to set VL %u data\n", vl);
+		ksft_test_result_fail("Failed to set %s VL %u data\n",
+				      type->name, vl);
 		goto out;
 	}
 
 	/* Read the data back */
 	if (get_fpsimd(child, &fpsimd_state)) {
-		ksft_test_result_fail("Failed to read VL %u FPSIMD data\n",
-				      vl);
+		ksft_test_result_fail("Failed to read %s VL %u FPSIMD data\n",
+				      type->name, vl);
 		goto out;
 	}
 
@@ -419,7 +462,8 @@ static void ptrace_set_sve_get_fpsimd_data(pid_t child, unsigned int vl)
 		       sizeof(tmp));
 
 		if (tmp != fpsimd_state.vregs[i]) {
-			printf("# Mismatch in FPSIMD for VL %u Z%d\n", vl, i);
+			printf("# Mismatch in FPSIMD for %s VL %u Z%d\n",
+			       type->name, vl, i);
 			errors++;
 		}
 	}
@@ -429,8 +473,8 @@ static void ptrace_set_sve_get_fpsimd_data(pid_t child, unsigned int vl)
 	check_u32(vl, "FPCR", write_buf + SVE_PT_SVE_FPCR_OFFSET(vq),
 		  &fpsimd_state.fpcr, &errors);
 
-	ksft_test_result(errors == 0, "Set and get FPSIMD data for VL %u\n",
-			 vl);
+	ksft_test_result(errors == 0, "Set and get FPSIMD data for %s VL %u\n",
+			 type->name, vl);
 
 out:
 	free(write_buf);
@@ -440,7 +484,7 @@ static int do_parent(pid_t child)
 {
 	int ret = EXIT_FAILURE;
 	pid_t pid;
-	int status;
+	int status, i;
 	siginfo_t si;
 	unsigned int vq, vl;
 	bool vl_supported;
@@ -499,26 +543,47 @@ static int do_parent(pid_t child)
 		}
 	}
 
-	/* FPSIMD via SVE regset */
-	ptrace_sve_fpsimd(child);
-
-	/* prctl() flags */
-	ptrace_set_get_inherit(child);
-
-	/* Step through every possible VQ */
-	for (vq = SVE_VQ_MIN; vq <= SVE_VQ_MAX; vq++) {
-		vl = sve_vl_from_vq(vq);
+	for (i = 0; i < ARRAY_SIZE(vec_types); i++) {
+		/* FPSIMD via SVE regset */
+		if (getauxval(vec_types[i].hwcap_type) & vec_types[i].hwcap) {
+			ptrace_sve_fpsimd(child, &vec_types[i]);
+		} else {
+			ksft_test_result_skip("%s FPSIMD get via SVE\n",
+					      vec_types[i].name);
+			ksft_test_result_skip("%s FPSIMD set via SVE\n",
+					      vec_types[i].name);
+			ksft_test_result_skip("%s set read via FPSIMD\n",
+					      vec_types[i].name);
+		}
 
-		/* First, try to set this vector length */
-		ptrace_set_get_vl(child, vl, &vl_supported);
+		/* prctl() flags */
+		ptrace_set_get_inherit(child, &vec_types[i]);
+
+		/* Step through every possible VQ */
+		for (vq = SVE_VQ_MIN; vq <= SVE_VQ_MAX; vq++) {
+			vl = sve_vl_from_vq(vq);
+
+			/* First, try to set this vector length */
+			if (getauxval(vec_types[i].hwcap_type) &
+			    vec_types[i].hwcap) {
+				ptrace_set_get_vl(child, &vec_types[i], vl,
+						  &vl_supported);
+			} else {
+				ksft_test_result_skip("%s get/set VL %d\n",
+						      vec_types[i].name, vl);
+				vl_supported = false;
+			}
 
-		/* If the VL is supported validate data set/get */
-		if (vl_supported) {
-			ptrace_set_sve_get_sve_data(child, vl);
-			ptrace_set_sve_get_fpsimd_data(child, vl);
-		} else {
-			ksft_test_result_skip("set SVE get SVE for VL %d\n", vl);
-			ksft_test_result_skip("set SVE get FPSIMD for VL %d\n", vl);
+			/* If the VL is supported validate data set/get */
+			if (vl_supported) {
+				ptrace_set_sve_get_sve_data(child, &vec_types[i], vl);
+				ptrace_set_sve_get_fpsimd_data(child, &vec_types[i], vl);
+			} else {
+				ksft_test_result_skip("%s set SVE get SVE for VL %d\n",
+						      vec_types[i].name, vl);
+				ksft_test_result_skip("%s set SVE get FPSIMD for VL %d\n",
+						      vec_types[i].name, vl);
+			}
 		}
 	}
 
diff --git a/tools/testing/selftests/arm64/signal/test_signals_utils.c b/tools/testing/selftests/arm64/signal/test_signals_utils.c
index 22722abc9dfa..2f8c23af3b5e 100644
--- a/tools/testing/selftests/arm64/signal/test_signals_utils.c
+++ b/tools/testing/selftests/arm64/signal/test_signals_utils.c
@@ -310,14 +310,12 @@ int test_setup(struct tdescr *td)
 
 int test_run(struct tdescr *td)
 {
-	if (td->sig_trig) {
-		if (td->trigger)
-			return td->trigger(td);
-		else
-			return default_trigger(td);
-	} else {
+	if (td->trigger)
+		return td->trigger(td);
+	else if (td->sig_trig)
+		return default_trigger(td);
+	else
 		return td->run(td, NULL, NULL);
-	}
 }
 
 void test_result(struct tdescr *td)